基于映射到值向量合并列值

Coalesce column values based on mapping to a vector of values

如果我有以下两个对象:

> set.seed(100)
> lookup <- sample(1:3, 20, replace=T)
> lookup
[1] 2 3 2 3 1 2 2 3 2 2 3 2 2 3 3 3 3 2 1 3

> tb <- tibble(A=runif(20,0,1), B=runif(20,0,1), C= runif(20,0,1))
> tb

> tb
# A tibble: 20 × 3
       A     B      C
   <dbl> <dbl>  <dbl>
 1 0.770 0.780 0.456 
 2 0.882 0.884 0.445 
 3 0.549 0.208 0.245 
 4 0.278 0.307 0.694 
 5 0.488 0.331 0.412 
 6 0.929 0.199 0.328 
 7 0.349 0.236 0.573 
 8 0.954 0.275 0.967 
 9 0.695 0.591 0.662 
10 0.889 0.253 0.625 
11 0.180 0.123 0.857 
12 0.629 0.230 0.775 
13 0.990 0.598 0.834 
14 0.130 0.211 0.0915
15 0.331 0.464 0.460 
16 0.865 0.647 0.599 
17 0.778 0.961 0.920 
18 0.827 0.676 0.983 
19 0.603 0.445 0.0378
20 0.491 0.358 0.578

如何使用lookuptb中select对应row/column的值?

所以我最终应该得到一个与 lookup 大小相同的一维向量。它看起来像这样:

> new data
> [1] 0.780 0.445 0.208 0.694 0.488 ... 0.578

谢谢!

data.frame(但 tibbledata.table 不支持矩阵索引,因此对于此数据,

set.seed(42)
lookup <- sample(1:3, 20, replace=T)
lookup
#  [1] 1 1 1 1 2 2 2 1 3 3 1 1 2 2 2 3 3 1 1 3
tb <- tibble(A=runif(20,0,1), B=runif(20,0,1), C= runif(20,0,1))
head(tb)
# # A tibble: 6 x 3
#       A     B      C
#   <dbl> <dbl>  <dbl>
# 1 0.514 0.958 0.189 
# 2 0.390 0.888 0.271 
# 3 0.906 0.640 0.828 
# 4 0.447 0.971 0.693 
# 5 0.836 0.619 0.241 
# 6 0.738 0.333 0.0430

我们可以做到

as.data.frame(tb)[cbind(seq_along(lookup), lookup)]
#  [1] 0.514211784 0.390203467 0.905738131 0.446969628 0.618838207 0.333427211 0.346748248 0.388108283 0.479398564
# [10] 0.197410342 0.832916080 0.007334147 0.171264330 0.261087964 0.514412935 0.581604003 0.157905208 0.037431033
# [19] 0.973539914 0.775823363

不用as.data.frame可以做一个效率较低的方法:

mapply(`[[`, list(tb), seq_along(lookup), lookup)
#  [1] 0.514211784 0.390203467 0.905738131 0.446969628 0.618838207 0.333427211 0.346748248 0.388108283 0.479398564
# [10] 0.197410342 0.832916080 0.007334147 0.171264330 0.261087964 0.514412935 0.581604003 0.157905208 0.037431033
# [19] 0.973539914 0.775823363
## also works with `list(as.data.table(tb))`

虽然它确实在性能上受到了很大的打击(不足为奇):

bench::mark(
  sindri_baldur1 = unlist(tb, use.names = FALSE)[seq_along(lookup) + (lookup - 1L)*nrow(tb)], 
  sindri_baldur2 = unlist(tb)[seq_along(lookup) + (lookup - 1L)*nrow(tb)], 
  base = as.data.frame(tb)[cbind(seq_along(lookup), lookup)], 
  mapply = mapply(`[[`, list(tb), seq_along(lookup), lookup), 
  paulsmith2 = {
tb %>% 
  mutate(lookup = lookup) %>% 
  rowwise %>% 
  mutate(new = c_across(A:C)[lookup]) %>% 
  pull(new)
},
  check = FALSE)
# # A tibble: 5 x 13
#   expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory  time   gc    
#   <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>  <list> <list>
# 1 sindri_baldur1    4.5us    5.3us   159430.      736B    15.9   9999     1     62.7ms <NULL> <Rprof~ <benc~ <tibb~
# 2 sindri_baldur2   13.2us   14.7us    56723.    1.44KB     0    10000     0    176.3ms <NULL> <Rprof~ <benc~ <tibb~
# 3 base             78.3us   91.6us     7334.      944B     8.59  3414     4    465.5ms <NULL> <Rprof~ <benc~ <tibb~
# 4 mapply          612.4us 779.45us      942.      720B     6.39   442     3    469.4ms <NULL> <Rprof~ <benc~ <tibb~
# 5 paulsmith2       4.37ms   5.85ms      147.    20.3KB     6.51    68     3    461.1ms <NULL> <Rprof~ <benc~ <tibb~

(我必须使用 check=FALSE 来处理 sindri_baldur2 中引入的名称,否则所有结果在数值上都是相同的。)

基础 R 解决方案:

tb$lookup <- lookup
tb$new <- apply(tb, 1, function(x) x[x[4]])
new <- tb$new
new

#>  [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907
#>  [7] 0.23569430 0.96699908 0.59132105 0.25339065 0.85665304 0.22990589
#> [13] 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
#> [19] 0.60332436 0.57793740

另一种可能的解决方案,基于tidyverse

library(tidyverse)

set.seed(100)

lookup <- sample(1:3, 20, replace=T)
tb <- tibble(A=runif(20,0,1), B=runif(20,0,1), C= runif(20,0,1))

tb %>% 
  mutate(lookup = lookup) %>% 
  rowwise %>% 
  mutate(new = c_across(A:C)[lookup]) %>% 
  pull(new)

#>  [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907
#>  [7] 0.23569430 0.96699908 0.59132105 0.25339065 0.85665304 0.22990589
#> [13] 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
#> [19] 0.60332436 0.57793740

你可以:

unlist(tb, use.names = FALSE)[seq_along(lookup) + (lookup - 1L)*nrow(tb)]

#  [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907 0.23569430 0.96699908 0.59132105
# [10] 0.25339065 0.85665304 0.22990589 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
# [19] 0.60332436 0.57793740

您也可以use.names并跟踪原始位置:

unlist(tb)[seq_along(lookup) + (lookup - 1L)*nrow(tb)] |> head()
#        B1        C2        B3        C4        A5        B6 
# 0.7803585 0.4454140 0.2077139 0.6943507 0.4883060 0.1986791