随机替换数据框中的值

Question

我正在使用 R 编程语言。假设我有以下数据框：

# Three numeric columns sharing a mean of 10 with increasing spread, plus a
# three-level grouping factor whose levels are drawn with unequal probability.
a <- rnorm(n = 100, mean = 10, sd = 1)
b <- rnorm(n = 100, mean = 10, sd = 5)
c <- rnorm(n = 100, mean = 10, sd = 10)
group_labels <- LETTERS[1:3]
d <- as.factor(
  sample(group_labels, size = 100, replace = TRUE, prob = c(0.5, 0.3, 0.2))
)

# Column names are taken from the variable names: a, b, c, d.
my_data <- data.frame(a, b, c, d)

head(my_data)
          a         b          c d
1 12.433326 10.573004   2.586044 A
2  9.985524  8.903590  25.806358 C
3  9.538077 13.875609 -11.572231 C
4  9.342444  6.483715   4.056420 B
5  8.825197  8.633457   6.357470 A
6  9.121292  7.988194  15.999959 B

我的问题：对于任意行

当“d = A”时，我想以 20% 的概率将列“a”的值随机替换为 0，以 30% 的概率将列“b”的值替换为 0，以 50% 的概率将列“c”的值替换为 0
当“d = B”时，我想以 50% 的概率将列“a”的值随机替换为 0，以 60% 的概率将列“b”的值替换为 0，以 50% 的概率将列“c”的值替换为 0
当“d = C”时，我想以 20% 的概率将列“a”的值随机替换为 0，以 15% 的概率将列“b”的值替换为 0，以 20% 的概率将列“c”的值替换为 0

我可以用非常低效的方式使用 base R 来做到这一点：

# Split the data by the level of `d` so that each group can use its own
# replacement probabilities.
A <- my_data[which(my_data$d == "A"), ]
B <- my_data[which(my_data$d == "B"), ]
C <- my_data[which(my_data$d == "C"), ]

# For every group and column, draw one flag per row:
#   "A" = replace the value with 0, "B" = keep the value.
# The first entry of `prob` is therefore the replacement probability.
#
# Fixes relative to the earlier version of this snippet:
#   * the a-columns kept the value on "A", i.e. replaced with probability
#     1 - p instead of p;
#   * the c-columns tested `b_new == "C"` (wrong flag column and a letter that
#     is never drawn), so c_new2 was always 0;
#   * group C / column c used prob = c(0.8, 0.2) instead of c(0.2, 0.8).
# Output printed from the earlier version will therefore not match.

# Group A: a 20%, b 30%, c 50%.
A$a_new <- sample(LETTERS[1:2], nrow(A), replace = TRUE, prob = c(0.2, 0.8))
A$b_new <- sample(LETTERS[1:2], nrow(A), replace = TRUE, prob = c(0.3, 0.7))
A$c_new <- sample(LETTERS[1:2], nrow(A), replace = TRUE, prob = c(0.5, 0.5))

A$a_new2 <- ifelse(A$a_new == "A", 0, A$a)
A$b_new2 <- ifelse(A$b_new == "A", 0, A$b)
A$c_new2 <- ifelse(A$c_new == "A", 0, A$c)

# Group B: a 50%, b 60%, c 50%.
B$a_new <- sample(LETTERS[1:2], nrow(B), replace = TRUE, prob = c(0.5, 0.5))
B$b_new <- sample(LETTERS[1:2], nrow(B), replace = TRUE, prob = c(0.6, 0.4))
B$c_new <- sample(LETTERS[1:2], nrow(B), replace = TRUE, prob = c(0.5, 0.5))

B$a_new2 <- ifelse(B$a_new == "A", 0, B$a)
B$b_new2 <- ifelse(B$b_new == "A", 0, B$b)
B$c_new2 <- ifelse(B$c_new == "A", 0, B$c)

# Group C: a 20%, b 15%, c 20%.
C$a_new <- sample(LETTERS[1:2], nrow(C), replace = TRUE, prob = c(0.2, 0.8))
C$b_new <- sample(LETTERS[1:2], nrow(C), replace = TRUE, prob = c(0.15, 0.85))
C$c_new <- sample(LETTERS[1:2], nrow(C), replace = TRUE, prob = c(0.2, 0.8))

C$a_new2 <- ifelse(C$a_new == "A", 0, C$a)
C$b_new2 <- ifelse(C$b_new == "A", 0, C$b)
C$c_new2 <- ifelse(C$c_new == "A", 0, C$c)

# Stack the three groups back together (rows end up ordered by group).
final <- rbind(A, B, C)

head(final)
           a         b         c d a_new b_new c_new   a_new2    b_new2 c_new2
1  12.433326 10.573004  2.586044 A     A     B     B 12.43333 10.573004      0
5   8.825197  8.633457  6.357470 A     B     B     B  0.00000  8.633457      0
7   9.594164 10.600787 27.190108 A     B     A     B  0.00000  0.000000      0
10  8.441369  1.944389 11.250866 A     B     A     B  0.00000  0.000000      0
11  9.192280 13.970166 -2.829124 A     B     B     A  0.00000 13.970166      0
12  9.916996 12.970319  3.472191 A     B     A     A  0.00000  0.000000      0

有谁知道是否有更高效的方法来解决这个问题？也许可以使用 dplyr 库和 mutate() 函数来完成？

谢谢！

Answer 1

这是一个使用 tidyverse（特别是 purrr 和 dplyr）的解决方案。我的策略：

1. 为 d 的每个取值定义替换概率。
2. 根据 d 的值拆分数据。
3. 使用 purrr::map2() 遍历每个子数据框及其对应的替换概率向量，并进行相应的替换。注意：您必须确保所有内容的顺序一致——即列 a、b、c 的顺序要与替换概率的顺序一致，步骤 2 中拆分出的数据的顺序也要与步骤 1 中概率向量的顺序一致。
4. 将结果合并回一个数据框中。

library(tidyverse)

# Probability of replacing a value with 0, one vector per level of `d`.
# Inside each vector the entries follow the column order a, b, c.
# Group A / column b is 0.3, as the question asks for 30% (an earlier version
# of this snippet used 0.4, so previously printed proportions reflect that).
replacement_probs <- list(
  A = c(0.2, 0.3, 0.5),
  B = c(0.5, 0.6, 0.5),
  C = c(0.2, 0.15, 0.2)
)

# Set a share `p` of the elements of `x` to 0, at randomly chosen positions.
zero_out <- function(x, p) {
  # floor() spells out the whole-number count taken from a fractional size.
  n_zero <- floor(length(x) * p)
  x[sample(length(x), n_zero)] <- 0
  x
}

updated_data <- my_data %>%
  # One sub data frame per level of `d`, in factor-level order (A, B, C);
  # this must line up with the order of `replacement_probs`.
  split(.$d) %>%
  map2(replacement_probs, function(sub_data, probs) {
    sub_data %>%
      select(a, b, c) %>%
      # Pair column a with probs[1], b with probs[2], c with probs[3].
      map2(probs, zero_out) %>%
      as_tibble() %>%
      # Re-attach the grouping column dropped by select().
      mutate(d = sub_data$d)
  }) %>%
  # Stack the per-group results into a single tibble.
  bind_rows()

输出：

# A tibble: 1,000 x 4
       a     b     c d    
   <dbl> <dbl> <dbl> <fct>
 1  9.05  0     0    A    
 2 11.6   7.15  0    A    
 3  8.65  7.31  0    A    
 4  0     0     8.26 A    
 5  0     4.61  0    A    
 6 10.3   0    21.4  A    
 7 11.9  19.1   0    A    
 8  9.19  9.42  0    A    
 9  9.54 13.5   0    A    
10  9.34  7.08  0    A    
# ... with 990 more rows

我们可以通过 d 再次拆分数据并计算零的数量来验证我们的工作。请注意，我将样本大小更改为 1000，以表明它有效。

# For each level of `d`, report the share of entries equal to 0 in every
# column (the factor column `d` itself always yields 0).
updated_data %>%
  split(.$d) %>%
  map(function(group_rows) {
    map_dbl(group_rows, function(column) sum(column == 0) / length(column))
  })

# Output
$A
        a         b         c         d 
0.1983806 0.3987854 0.5000000 0.0000000 

$B
        a         b         c         d 
0.4983389 0.5980066 0.4983389 0.0000000 

$C
        a         b         c         d 
0.2000000 0.1463415 0.2000000 0.0000000

Answer 2

这很相似，但没有嵌套，也不需要 mapping。

可重现的随机数据：

# Fix the RNG state so the draws below are repeatable.
set.seed(42)

# Columns are generated in the order a, b, c, d so the random stream is
# consumed in the same sequence; no temporary variables are left behind.
my_data <- data.frame(
  a = rnorm(100, mean = 10, sd = 1),
  b = rnorm(100, mean = 10, sd = 5),
  c = rnorm(100, mean = 10, sd = 10),
  d = as.factor(
    sample(LETTERS[1:3], size = 100, replace = TRUE, prob = c(0.5, 0.3, 0.2))
  )
)
head(my_data)
#           a         b          c d
# 1 11.370958 16.004827 -10.009292 A
# 2  9.435302 15.223755  13.337772 A
# 3 10.363128  4.983957  21.713251 B
# 4 10.632863 19.242410  30.595392 A
# 5 10.404268  6.666133  -3.768616 B
# 6  9.893875 10.527569  -1.508556 A

再创建一个数据框，存放每个分组、每一列对应的替换概率：

# Lookup table of replacement probabilities: one row per level of `d`,
# one `prob_*` column per numeric column of `my_data`.
otherframe <- data.frame(
  d      = c("A", "B", "C"),
  prob_a = c(0.2, 0.5, 0.2),
  prob_b = c(0.3, 0.6, 0.15),
  prob_c = c(0.5, 0.5, 0.2)
)
otherframe
#   d prob_a prob_b prob_c
# 1 A    0.2   0.30    0.5
# 2 B    0.5   0.60    0.5
# 3 C    0.2   0.15    0.2

下面是它的用法。首先，我将（重新）使用另一个答案中的函数：

# Adapted from the helper used in the other answer.
# Replaces a randomly chosen share `prop` of the elements of `x` with `def`.
# The number of replaced elements is prop * length(x), rounded up.
my_func2 <- function(x, prop, def = 0) {
  n_replace <- ceiling(prop * length(x))
  positions <- sample(length(x), size = n_replace, replace = FALSE)
  replace(x, positions, def)
}

# dplyr supplies left_join(), group_by(), mutate(), ungroup(), select() and
# the %>% pipe used below; load it so this snippet runs on its own.
library(dplyr)

out <- my_data %>%
  # Attach the per-group probabilities to every row.
  left_join(otherframe, by = "d") %>%
  group_by(d) %>%
  mutate(
    # Within a group every row carries the same probability, so the first
    # value is enough.
    a = my_func2(a, prob_a[1]),
    b = my_func2(b, prob_b[1]),
    c = my_func2(c, prob_c[1])
  ) %>%
  ungroup() %>%
  # Drop the helper columns again.
  select(-prob_a, -prob_b, -prob_c)
out
# # A tibble: 100 x 4
#        a     b     c d
#    <dbl> <dbl> <dbl> <chr>
#  1 11.4   0     0    A
#  2  9.44 15.2   0    A
#  3 10.4   4.98 21.7  B
#  4 10.6   0     0    A
#  5  0     0     0    B
#  6  0    10.5   0    A
#  7  0     7.89  2.94 B
#  8  9.91  9.39  0    A
#  9 12.0  10.9   3.54 A
# 10  9.94 10.6   0    B
# # ... with 90 more rows

并验证 0 的比例是否合理。

# Share of (numerically) zero entries per group and column; the small
# tolerance is used instead of an exact comparison with 0.
out %>%
  group_by(d) %>%
  summarize(
    across(a:c, function(column) sum(abs(column) < 1e-12) / n())
  )
# # A tibble: 3 x 4
#   d         a     b     c
#   <chr> <dbl> <dbl> <dbl>
# 1 A     0.208 0.302 0.509
# 2 B     0.517 0.621 0.517
# 3 C     0.222 0.167 0.222

随机替换数据框中的值

Randomly Replacing Values in a Data Frame

r

data-manipulation

dplyr