根据 R 中的列表列表改变列
Mutate column based on list of lists in R
我有一个数据框,我想 gather
使其为高格式,然后 mutate
在另一列上,其值基于列表中另一列的字符串的成员资格列表。例如,我有以下数据框和列表列表:
dummy_data <- data.frame("id" = 1:20,"test1_10" = sample(1:100, 20),"test2_11" = sample(1:100, 20),
"test3_12" = sample(1:100, 20),"check1_20" = sample(1:100, 20),
"check2_21" = sample(1:100, 20),"sound1_30" = sample(1:100, 20),
"sound2_31" = sample(1:100, 20),"sound3_32" = sample(1:100, 20))
dummylist <- list(c('test1_','test2_','test3_'),c('check1_','check2_'),c('sound1_','sound2_','sound3_'))
names(dummylist) <- c('shipments','arrivals','departures')
然后我像这样收集数据框:
dummy_data <- dummy_data %>%
gather("part", "number", 2:ncol(.))
我想要做的是添加一个列,该列具有在 dummylist
中找到的列表的名称,其中部分列中下划线之前的字符串是一个成员。我可以这样做:
dummydata <- dummydata %>%
mutate(Group = case_when(
str_extract(part,'.*_') %in% dummylist[[1]] ~ names(dummylist[1]),
str_extract(part,'.*_') %in% dummylist[[2]] ~ names(dummylist[2]),
str_extract(part,'.*_') %in% dummylist[[3]] ~ names(dummylist[3])
))
但是,dummylist
中的每个 list/group 需要一个单独的 str_extract
行。而我的真实数据远不止 3 lists/groups。所以我想知道是否有更有效的方法来执行此变异步骤以获取列表的名称?
非常感谢任何帮助,谢谢!
将 'dummylist' 转换为两列数据集后,使用 regex_left_join
可能更容易
library(fuzzyjoin)
library(dplyr)
library(tidyr)
library(tibble)
dummy_data %>%
# // reshape to long format - pivot_longer instead of gather
pivot_longer(cols = -id, names_to = 'part', values_to = 'number') %>%
# // join with the tibble/data.frame converted dummylist
regex_left_join(dummylist %>%
enframe(name = 'Group', value = 'part') %>%
unnest(part)) %>%
rename(part = part.x) %>%
select(-part.y)
-输出
# A tibble: 160 × 4
id part number Group
<int> <chr> <int> <chr>
1 1 test1_10 72 shipments
2 1 test2_11 62 shipments
3 1 test3_12 17 shipments
4 1 check1_20 89 arrivals
5 1 check2_21 54 arrivals
6 1 sound1_30 39 departures
7 1 sound2_31 94 departures
8 1 sound3_32 95 departures
9 2 test1_10 77 shipments
10 2 test2_11 4 shipments
# … with 150 more rows
如果您事先准备好 查找 table,则不需要任何额外的库,但是 dplyr
和 tidyr
:
lookup <- sapply(
names(dummylist),
\(nm) { setNames(rep(nm, length(dummylist[[nm]])), dummylist[[nm]]) }
) |>
setNames(nm = NULL) |>
unlist()
lookup
# test1_ test2_ test3_ check1_ check2_ sound1_ sound2_ sound3_
# "shipments" "shipments" "shipments" "arrivals" "arrivals" "departures" "departures" "departures"
现在您只需 gsub
即时翻译您的 part
,在通常的 mutate()
动词中:
dummy_data |>
pivot_longer(-id, names_to = 'part', values_to = 'number') |>
mutate(group = lookup[gsub('^(\w+_).*$', '\1', part)])
# # A tibble: 160 × 4
# id part number group
# <int> <chr> <int> <chr>
# 1 1 test1_10 91 shipments
# 2 1 test2_11 74 shipments
# 3 1 test3_12 46 shipments
# 4 1 check1_20 62 arrivals
# 5 1 check2_21 7 arrivals
# 6 1 sound1_30 35 departures
# 7 1 sound2_31 23 departures
# 8 1 sound3_32 84 departures
# 9 2 test1_10 59 shipments
# 10 2 test2_11 73 shipments
# # … with 150 more rows
我有一个数据框,我想 gather
使其为高格式,然后 mutate
在另一列上,其值基于列表中另一列的字符串的成员资格列表。例如,我有以下数据框和列表列表:
dummy_data <- data.frame("id" = 1:20,"test1_10" = sample(1:100, 20),"test2_11" = sample(1:100, 20),
"test3_12" = sample(1:100, 20),"check1_20" = sample(1:100, 20),
"check2_21" = sample(1:100, 20),"sound1_30" = sample(1:100, 20),
"sound2_31" = sample(1:100, 20),"sound3_32" = sample(1:100, 20))
dummylist <- list(c('test1_','test2_','test3_'),c('check1_','check2_'),c('sound1_','sound2_','sound3_'))
names(dummylist) <- c('shipments','arrivals','departures')
然后我像这样收集数据框:
dummy_data <- dummy_data %>%
gather("part", "number", 2:ncol(.))
我想要做的是添加一个列,该列具有在 dummylist
中找到的列表的名称,其中部分列中下划线之前的字符串是一个成员。我可以这样做:
dummydata <- dummydata %>%
mutate(Group = case_when(
str_extract(part,'.*_') %in% dummylist[[1]] ~ names(dummylist[1]),
str_extract(part,'.*_') %in% dummylist[[2]] ~ names(dummylist[2]),
str_extract(part,'.*_') %in% dummylist[[3]] ~ names(dummylist[3])
))
但是,dummylist
中的每个 list/group 需要一个单独的 str_extract
行。而我的真实数据远不止 3 lists/groups。所以我想知道是否有更有效的方法来执行此变异步骤以获取列表的名称?
非常感谢任何帮助,谢谢!
将 'dummylist' 转换为两列数据集后,使用 regex_left_join
可能更容易
library(fuzzyjoin)
library(dplyr)
library(tidyr)
library(tibble)
dummy_data %>%
# // reshape to long format - pivot_longer instead of gather
pivot_longer(cols = -id, names_to = 'part', values_to = 'number') %>%
# // join with the tibble/data.frame converted dummylist
regex_left_join(dummylist %>%
enframe(name = 'Group', value = 'part') %>%
unnest(part)) %>%
rename(part = part.x) %>%
select(-part.y)
-输出
# A tibble: 160 × 4
id part number Group
<int> <chr> <int> <chr>
1 1 test1_10 72 shipments
2 1 test2_11 62 shipments
3 1 test3_12 17 shipments
4 1 check1_20 89 arrivals
5 1 check2_21 54 arrivals
6 1 sound1_30 39 departures
7 1 sound2_31 94 departures
8 1 sound3_32 95 departures
9 2 test1_10 77 shipments
10 2 test2_11 4 shipments
# … with 150 more rows
如果您事先准备好 查找 table,则不需要任何额外的库,但是 dplyr
和 tidyr
:
lookup <- sapply(
names(dummylist),
\(nm) { setNames(rep(nm, length(dummylist[[nm]])), dummylist[[nm]]) }
) |>
setNames(nm = NULL) |>
unlist()
lookup
# test1_ test2_ test3_ check1_ check2_ sound1_ sound2_ sound3_
# "shipments" "shipments" "shipments" "arrivals" "arrivals" "departures" "departures" "departures"
现在您只需 gsub
即时翻译您的 part
,在通常的 mutate()
动词中:
dummy_data |>
pivot_longer(-id, names_to = 'part', values_to = 'number') |>
mutate(group = lookup[gsub('^(\w+_).*$', '\1', part)])
# # A tibble: 160 × 4
# id part number group
# <int> <chr> <int> <chr>
# 1 1 test1_10 91 shipments
# 2 1 test2_11 74 shipments
# 3 1 test3_12 46 shipments
# 4 1 check1_20 62 arrivals
# 5 1 check2_21 7 arrivals
# 6 1 sound1_30 35 departures
# 7 1 sound2_31 23 departures
# 8 1 sound3_32 84 departures
# 9 2 test1_10 59 shipments
# 10 2 test2_11 73 shipments
# # … with 150 more rows