在 R/tidyverse 中使用模式向量搜索多个变量
Search many variables with a vector of patterns in R/tidyverse
我想在我的数据框 (data_df) 的多个变量 (key1 到 key30) 中搜索任意一个模式(这些模式存储在向量“my_patterns”中)。
对于每个观察,结果应存储在 30 个虚拟变量/列(key1_match 到 key30_match)中:1 表示该观察的“keyX”变量与“my_patterns”向量中的某个值匹配,0 表示不匹配。我只需要知道是否存在匹配项,而不需要知道匹配的是哪一个。
我如何在 R 中执行此操作并且最好使用 tidyverse 函数?
# Codes to search for in the key columns.
my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
# Example data (looks like dput() output): 29 rows with a numeric `id`, which
# repeats for some ids, and 30 character columns key1..key30. key1 uses ""
# for empty cells; key2..key30 use NA.
data_df <- structure(list(id = c(1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1317, 11832,
1943, 1316, 8317, 13405, 12881, 12881, 12881, 12881, 12881, 12881,
12882, 12882, 12882, 12882, 12883, 12883, 12883), key1 = c("",
"", "", "", "DR029", "", "AF063", "UJD05", "JCF12", "", "AF021",
"DT022", "XS912", "UJC12", "UJD05", "JAH00", "UJD02", "DT016",
"DT016", "", "DV071", "DR029", "2154", "", "AJ079", "XV018",
"7462", "7460", "LEG10"), key2 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "JFF00", NA, "AF021", "DT022", "XS912", "UJC12", "UJD05",
"JAH00", "UJD05", "DT017", "DT017", NA, "DV022", "JDB10", NA,
NA, "AJ080", NA, NA, "7461", "LCA06"), key3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, "UJD02", NA, "AF021", "DT022", "ZV033", "UJC12",
"UJD05", "JAH00", "AF012", "DT019", "DT019", NA, "DV079", NA,
NA, NA, "DR029", NA, NA, "7469", NA), key4 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05",
"JAH00", "AG053", NA, "DT024", NA, "DV027", NA, NA, NA, "DT016",
NA, NA, "7280", NA), key5 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", "JKB30",
"JAH01", NA, NA, NA, "DV064", NA, NA, NA, "UJD02", NA, NA, NA,
NA), key6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02",
"DT022", "DV071", "UJC12", "UJD05", "JKB30", "JCA55", NA, NA,
NA, "DV040", NA, NA, NA, NA, NA, NA, NA, NA), key7 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DT022", "DV071", "UJD05",
"JCA55", "JKB30", "UJD02", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), key8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"UJD02", "DV051", "DV071", "UJD05", "JCA55", "JKB30", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key9 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DV053", "DV071", "UJD05",
"JCA55", "JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), key10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02",
"DV055", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), key11 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", "UJD05", "TPW99",
"JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
key12 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053",
"DV057", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key13 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071",
"JCA55", "AJ050", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key14 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AF053", "DV057", "DV071", "JCA55", "AJ050",
"JFB40", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053",
"DV057", "DV071", "JCA55", "AJ050", "JFB40", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key16 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071",
"JCA55", "AG040", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key17 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "AG040",
"JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key18 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AG009",
"DV057", "DV071", "JCA55", "AG040", "JFF23", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key19 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071",
"JCA55", "XS009", "JFF23", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key20 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "XS009",
"JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key21 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004",
"DV057", "DT016", "JCA55", "XS009", "JWA00", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key22 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071",
"JCA55", "UJD05", "JWA00", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key23 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AJ004", "DV057", "XS918", "JCA55", "UJD05",
"JWA00", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key24 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004",
"DV057", "DV071", "JCA55", "JCA55", "JWA00", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key25 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071",
"JCA55", "TPW99", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key26 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "DV057", "DV071", "JCA55", "AJ050", "QBB99",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key27 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071",
"JCA55", "AG040", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key28 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "DV057", "DV071", "JCA55", "XS009", "QBB99",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key29 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071",
"JCA55", NA, "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), key30 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "DV057", "DV071", "JCA55", NA, "QBB99", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11814L, 106482L, 17355L,
11807L, 74026L, 120903L, 116030L, 116031L, 116032L, 116033L,
116034L, 116035L, 116036L, 116037L, 116038L, 116039L, 116040L,
116041L, 116042L), class = "data.frame")
library(tidyverse)
my_patterns <- c(
  "AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05",
  "AF012", "AG053", "JAH01", "JCA55", "QBB99"
)
# Join all codes into a single alternation regex: a value is a hit when it
# contains any one of them.
my_regex <- paste0(my_patterns, collapse = "|")
data_df %>%
  as_tibble() %>%
  # Long format: one row per (original row, key column).
  pivot_longer(-id) %>%
  # str_detect() is vectorised, so no per-element map is needed. NA cells
  # (no code recorded) are treated as "no match".
  mutate(value = as.numeric(replace_na(str_detect(value, my_regex), FALSE))) %>%
  # An id can span several rows. Taking the max per id/key flags the key as 1
  # when ANY row matched; keeping only the first row per id/key (as a
  # distinct() on id and name would) silently drops matches in later rows.
  pivot_wider(names_from = name, values_from = value, values_fn = max)
#> # A tibble: 12 x 31
#> id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#> 2 2 0 0 0 0 0 0 0 0 0 0 0 0
#> 3 3 1 0 1 0 0 0 0 0 0 0 0 0
#> 4 1317 1 1 1 1 1 1 1 1 1 1 0 0
#> 5 11832 1 1 1 1 1 1 1 0 1 0 0 0
#> 6 1943 0 0 0 0 0 0 0 0 0 0 0 0
#> 7 1316 1 1 1 1 1 1 1 1 1 1 1 1
#> 8 8317 1 1 1 1 1 1 1 1 1 0 0 0
#> 9 13405 0 0 0 0 0 0 0 0 0 0 0 0
#> 10 12881 1 1 1 1 1 1 1 0 0 0 0 0
#> 11 12882 0 0 0 0 1 0 0 0 0 0 0 0
#> 12 12883 0 0 0 0 0 0 0 0 0 0 0 0
#> # … with 18 more variables: key13 <dbl>, key14 <dbl>, key15 <dbl>, key16 <dbl>,
#> # key17 <dbl>, key18 <dbl>, key19 <dbl>, key20 <dbl>, key21 <dbl>,
#> # key22 <dbl>, key23 <dbl>, key24 <dbl>, key25 <dbl>, key26 <dbl>,
#> # key27 <dbl>, key28 <dbl>, key29 <dbl>, key30 <dbl>
由 reprex 包 (v2.0.1) 于 2021-12-01 创建
使用 dplyr,我们可以对所有以 'key' 开头的列 (starts_with('key')) 使用 mutate。如果该值与任一模式匹配,.x %in% my_patterns 输出 TRUE,否则输出 FALSE。我们可以用 +(...) 将其强制转换为数字。最后,按 id 分组,并用 max 进行 summarise。
library(dplyr)
# Step 1: replace every key column with a 0/1 integer flag (1 = the code is
# one of `my_patterns`).
# Step 2: collapse to one row per id, keeping the largest flag seen, i.e. 1
# as soon as any row of that id matched.
data_df %>%
  mutate(
    across(
      starts_with('key'),
      function(code) as.integer(code %in% my_patterns)
    )
  ) %>%
  group_by(id) %>%
  summarise(across(starts_with('key'), max))
# A tibble: 12 × 31
id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12 key13 key14 key15 key16 key17 key18 key19 key20 key21 key22
<dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1316 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
5 1317 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
6 1943 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 8317 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1
8 11832 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
9 12881 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 12882 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 12883 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 13405 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# … with 8 more variables: key23 <int>, key24 <int>, key25 <int>, key26 <int>, key27 <int>, key28 <int>, key29 <int>, key30 <int>
或者,我们可以把所有转换都放进 summarise(across()) 中来进一步简化:
library(dplyr)
# One row per id; each key column becomes 1 if any of that id's rows holds a
# code listed in `my_patterns`, otherwise 0.
data_df %>%
  group_by(id) %>%
  summarise(
    across(
      starts_with('key'),
      function(code) as.integer(any(code %in% my_patterns))
    )
  )
另一种解决方案:
library(tidyr)
data_df %>%
  # Go long first so the key names live in an explicit `name` column that
  # pivot_wider() can use for `names_from`.
  pivot_longer(-id) %>%
  # Widen back to one row per id. `+any(...)` yields a 0/1 flag even when an
  # id has several matching rows for the same key; sum() would return the
  # number of matches instead of a dummy.
  # Add names_glue = "{name}_match" to get key1_match ... key30_match.
  pivot_wider(
    id_cols = id,
    names_from = name,
    values_from = value,
    values_fn = function(x) +any(x %in% my_patterns)
  )
#> # A tibble: 12 × 31
#> id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12
#> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#> 2 2 0 0 0 0 0 0 0 0 0 0 0 0
#> 3 3 1 0 1 0 0 0 0 0 0 0 0 0
#> 4 1317 1 1 1 1 1 1 1 1 1 1 0 0
#> 5 11832 1 1 1 1 1 1 1 0 1 0 0 0
#> 6 1943 0 0 0 0 0 0 0 0 0 0 0 0
#> 7 1316 1 1 1 1 1 1 1 1 1 1 1 1
#> 8 8317 1 1 1 1 1 1 1 1 1 0 0 0
#> 9 13405 0 0 0 0 0 0 0 0 0 0 0 0
#> 10 12881 1 1 1 1 1 1 1 0 0 0 0 0
#> 11 12882 0 0 0 0 1 0 0 0 0 0 0 0
#> 12 12883 0 0 0 0 0 0 0 0 0 0 0 0
#> # … with 18 more variables: key13 <int>, key14 <int>, key15 <int>, key16 <int>,
#> # key17 <int>, key18 <int>, key19 <int>, key20 <int>, key21 <int>,
#> # key22 <int>, key23 <int>, key24 <int>, key25 <int>, key26 <int>,
#> # key27 <int>, key28 <int>, key29 <int>, key30 <int>
我想在我的数据框 (data_df) 的多个变量 (key1 到 key30) 中搜索任意一个模式(这些模式存储在向量“my_patterns”中)。对于每个观察,结果应存储在 30 个虚拟变量/列(key1_match 到 key30_match)中:1 表示该观察的“keyX”变量与“my_patterns”向量中的某个值匹配,0 表示不匹配。我只需要知道是否存在匹配项,而不需要知道匹配的是哪一个。
我如何在 R 中执行此操作并且最好使用 tidyverse 函数?
# Codes to search for in the key columns.
my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
# Example data (looks like dput() output): 29 rows with a numeric `id`, which
# repeats for some ids, and 30 character columns key1..key30. key1 uses ""
# for empty cells; key2..key30 use NA.
data_df <- structure(list(id = c(1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1317, 11832,
1943, 1316, 8317, 13405, 12881, 12881, 12881, 12881, 12881, 12881,
12882, 12882, 12882, 12882, 12883, 12883, 12883), key1 = c("",
"", "", "", "DR029", "", "AF063", "UJD05", "JCF12", "", "AF021",
"DT022", "XS912", "UJC12", "UJD05", "JAH00", "UJD02", "DT016",
"DT016", "", "DV071", "DR029", "2154", "", "AJ079", "XV018",
"7462", "7460", "LEG10"), key2 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "JFF00", NA, "AF021", "DT022", "XS912", "UJC12", "UJD05",
"JAH00", "UJD05", "DT017", "DT017", NA, "DV022", "JDB10", NA,
NA, "AJ080", NA, NA, "7461", "LCA06"), key3 = c(NA, NA, NA, NA,
NA, NA, NA, NA, "UJD02", NA, "AF021", "DT022", "ZV033", "UJC12",
"UJD05", "JAH00", "AF012", "DT019", "DT019", NA, "DV079", NA,
NA, NA, "DR029", NA, NA, "7469", NA), key4 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05",
"JAH00", "AG053", NA, "DT024", NA, "DV027", NA, NA, NA, "DT016",
NA, NA, "7280", NA), key5 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", "JKB30",
"JAH01", NA, NA, NA, "DV064", NA, NA, NA, "UJD02", NA, NA, NA,
NA), key6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02",
"DT022", "DV071", "UJC12", "UJD05", "JKB30", "JCA55", NA, NA,
NA, "DV040", NA, NA, NA, NA, NA, NA, NA, NA), key7 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DT022", "DV071", "UJD05",
"JCA55", "JKB30", "UJD02", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), key8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"UJD02", "DV051", "DV071", "UJD05", "JCA55", "JKB30", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key9 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DV053", "DV071", "UJD05",
"JCA55", "JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), key10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02",
"DV055", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), key11 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", "UJD05", "TPW99",
"JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
key12 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053",
"DV057", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key13 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071",
"JCA55", "AJ050", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key14 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AF053", "DV057", "DV071", "JCA55", "AJ050",
"JFB40", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053",
"DV057", "DV071", "JCA55", "AJ050", "JFB40", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key16 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071",
"JCA55", "AG040", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key17 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "AG040",
"JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key18 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AG009",
"DV057", "DV071", "JCA55", "AG040", "JFF23", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key19 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071",
"JCA55", "XS009", "JFF23", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key20 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "XS009",
"JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key21 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004",
"DV057", "DT016", "JCA55", "XS009", "JWA00", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key22 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071",
"JCA55", "UJD05", "JWA00", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key23 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "AJ004", "DV057", "XS918", "JCA55", "UJD05",
"JWA00", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), key24 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004",
"DV057", "DV071", "JCA55", "JCA55", "JWA00", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key25 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071",
"JCA55", "TPW99", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key26 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "DV057", "DV071", "JCA55", "AJ050", "QBB99",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key27 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071",
"JCA55", "AG040", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), key28 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "DV057", "DV071", "JCA55", "XS009", "QBB99",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key29 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071",
"JCA55", NA, "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), key30 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "DV057", "DV071", "JCA55", NA, "QBB99", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11814L, 106482L, 17355L,
11807L, 74026L, 120903L, 116030L, 116031L, 116032L, 116033L,
116034L, 116035L, 116036L, 116037L, 116038L, 116039L, 116040L,
116041L, 116042L), class = "data.frame")
library(tidyverse)
my_patterns <- c(
  "AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05",
  "AF012", "AG053", "JAH01", "JCA55", "QBB99"
)
# Join all codes into a single alternation regex: a value is a hit when it
# contains any one of them.
my_regex <- paste0(my_patterns, collapse = "|")
data_df %>%
  as_tibble() %>%
  # Long format: one row per (original row, key column).
  pivot_longer(-id) %>%
  # str_detect() is vectorised, so no per-element map is needed. NA cells
  # (no code recorded) are treated as "no match".
  mutate(value = as.numeric(replace_na(str_detect(value, my_regex), FALSE))) %>%
  # An id can span several rows. Taking the max per id/key flags the key as 1
  # when ANY row matched; keeping only the first row per id/key (as a
  # distinct() on id and name would) silently drops matches in later rows.
  pivot_wider(names_from = name, values_from = value, values_fn = max)
#> # A tibble: 12 x 31
#> id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#> 2 2 0 0 0 0 0 0 0 0 0 0 0 0
#> 3 3 1 0 1 0 0 0 0 0 0 0 0 0
#> 4 1317 1 1 1 1 1 1 1 1 1 1 0 0
#> 5 11832 1 1 1 1 1 1 1 0 1 0 0 0
#> 6 1943 0 0 0 0 0 0 0 0 0 0 0 0
#> 7 1316 1 1 1 1 1 1 1 1 1 1 1 1
#> 8 8317 1 1 1 1 1 1 1 1 1 0 0 0
#> 9 13405 0 0 0 0 0 0 0 0 0 0 0 0
#> 10 12881 1 1 1 1 1 1 1 0 0 0 0 0
#> 11 12882 0 0 0 0 1 0 0 0 0 0 0 0
#> 12 12883 0 0 0 0 0 0 0 0 0 0 0 0
#> # … with 18 more variables: key13 <dbl>, key14 <dbl>, key15 <dbl>, key16 <dbl>,
#> # key17 <dbl>, key18 <dbl>, key19 <dbl>, key20 <dbl>, key21 <dbl>,
#> # key22 <dbl>, key23 <dbl>, key24 <dbl>, key25 <dbl>, key26 <dbl>,
#> # key27 <dbl>, key28 <dbl>, key29 <dbl>, key30 <dbl>
由 reprex 包 (v2.0.1) 于 2021-12-01 创建
使用 dplyr,我们可以对所有以 'key' 开头的列 (starts_with('key')) 使用 mutate。如果该值与任一模式匹配,.x %in% my_patterns 输出 TRUE,否则输出 FALSE。我们可以用 +(...) 将其强制转换为数字。最后,按 id 分组,并用 max 进行 summarise。
library(dplyr)
# Step 1: replace every key column with a 0/1 integer flag (1 = the code is
# one of `my_patterns`).
# Step 2: collapse to one row per id, keeping the largest flag seen, i.e. 1
# as soon as any row of that id matched.
data_df %>%
  mutate(
    across(
      starts_with('key'),
      function(code) as.integer(code %in% my_patterns)
    )
  ) %>%
  group_by(id) %>%
  summarise(across(starts_with('key'), max))
# A tibble: 12 × 31
id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12 key13 key14 key15 key16 key17 key18 key19 key20 key21 key22
<dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1316 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
5 1317 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
6 1943 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 8317 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1
8 11832 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
9 12881 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 12882 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 12883 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 13405 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# … with 8 more variables: key23 <int>, key24 <int>, key25 <int>, key26 <int>, key27 <int>, key28 <int>, key29 <int>, key30 <int>
或者,我们可以把所有转换都放进 summarise(across()) 中来进一步简化:
library(dplyr)
# One row per id; each key column becomes 1 if any of that id's rows holds a
# code listed in `my_patterns`, otherwise 0.
data_df %>%
  group_by(id) %>%
  summarise(
    across(
      starts_with('key'),
      function(code) as.integer(any(code %in% my_patterns))
    )
  )
另一种解决方案:
library(tidyr)
data_df %>%
  # Go long first so the key names live in an explicit `name` column that
  # pivot_wider() can use for `names_from`.
  pivot_longer(-id) %>%
  # Widen back to one row per id. `+any(...)` yields a 0/1 flag even when an
  # id has several matching rows for the same key; sum() would return the
  # number of matches instead of a dummy.
  # Add names_glue = "{name}_match" to get key1_match ... key30_match.
  pivot_wider(
    id_cols = id,
    names_from = name,
    values_from = value,
    values_fn = function(x) +any(x %in% my_patterns)
  )
#> # A tibble: 12 × 31
#> id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12
#> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#> 2 2 0 0 0 0 0 0 0 0 0 0 0 0
#> 3 3 1 0 1 0 0 0 0 0 0 0 0 0
#> 4 1317 1 1 1 1 1 1 1 1 1 1 0 0
#> 5 11832 1 1 1 1 1 1 1 0 1 0 0 0
#> 6 1943 0 0 0 0 0 0 0 0 0 0 0 0
#> 7 1316 1 1 1 1 1 1 1 1 1 1 1 1
#> 8 8317 1 1 1 1 1 1 1 1 1 0 0 0
#> 9 13405 0 0 0 0 0 0 0 0 0 0 0 0
#> 10 12881 1 1 1 1 1 1 1 0 0 0 0 0
#> 11 12882 0 0 0 0 1 0 0 0 0 0 0 0
#> 12 12883 0 0 0 0 0 0 0 0 0 0 0 0
#> # … with 18 more variables: key13 <int>, key14 <int>, key15 <int>, key16 <int>,
#> # key17 <int>, key18 <int>, key19 <int>, key20 <int>, key21 <int>,
#> # key22 <int>, key23 <int>, key24 <int>, key25 <int>, key26 <int>,
#> # key27 <int>, key28 <int>, key29 <int>, key30 <int>