随机删除一行的一部分

Randomly Deleting Parts of a Row

我有以下数据框:

# Simulated panel data: `n` subjects observed at `n_times` time points.
# Each time point has a weight (normal, mean 100, sd 10) and a state
# (integer 1-5, drawn uniformly with replacement).
n <- 100
n_times <- 10
time_idx <- seq_len(n_times)

id <- seq_len(n)

# Draw every weight and every state in one call each; matrices are filled
# column by column, so column k holds the values for time point k.
weights <- matrix(
  rnorm(n * n_times, mean = 100, sd = 10),
  nrow = n,
  dimnames = list(NULL, paste0("weight_time_", time_idx))
)
states <- matrix(
  sample.int(5, n * n_times, replace = TRUE),
  nrow = n,
  dimnames = list(NULL, paste0("state_time_", time_idx))
)

# Interleave the columns as weight_time_1, state_time_1, weight_time_2, ...
col_order <- c("id", as.vector(rbind(colnames(weights), colnames(states))))
my_data <- data.frame(id, weights, states)[, col_order]

head(my_data)
  id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1  1      119.3852            2     111.30729            5      99.11912            5      97.06366            1     103.73559            4     100.53940            3      90.98888            2      95.10628            3
2  2      124.5046            3      86.74208            4      96.87224            3      88.84019            2      92.39560            4      96.83324            3     108.60610            1      90.24227            3
3  3       98.3621            2     114.60002            1      91.61257            3     121.88707            2     103.78418            2      96.77586            2     103.58945            3     102.08050            3
4  4      102.8222            3      95.72920            5      92.51412            4     107.94097            4     105.07041            3     116.22625            1     100.52621            5     102.88718            1
5  5      114.0140            5      94.04442            2     112.10150            2     111.40825            4      90.93852            4      83.81637            3     118.08578            5      84.64170            3
6  6      113.0468            2      96.90621            1     102.99961            4      89.28867            1     107.19814            2      99.29141            1      79.91099            1     106.01940            1
  weight_time_9 state_time_9 weight_time_10 state_time_10
1     105.34245            5      106.61219             4
2      93.87486            4       95.14339             1
3      99.22730            1      108.46509             4
4      88.78866            1      114.68032             5
5      93.28602            5       91.50742             1
6     104.14194            4       98.67597             2

我想从左边随机删除“每行的一部分”直到某列 - 这应该看起来像这样(“红线”指的是已删除的条目,例如替换为 NA):

我想到了以下方法:

第 1 步:首先,随机选择哪些 ID 有资格进行删除

 # Flag every id at random: 1 = delete, 2 = no delete.
 id <- seq_len(100)
 delete_or_not_delete <- sample.int(n = 2, size = 100, replace = TRUE)
 # One row per id together with its delete / no-delete flag.
 deleted_ids <- data.frame(
   id = id,
   delete_or_not_delete = delete_or_not_delete
 )

第 2 步: 对于被选中要删除的 id,随机选择要删除的列数(例如,排除“id”列,2 = 删除前 2 列,4 = 删除前 4 列,等等)

# Possible numbers of leading columns to delete (always an even count).
col_delete <- c(2, 4, 6, 8, 10, 12, 14, 16, 18)
# Draw one count per id, independently and with replacement.
col_delete <- sample(col_delete, size = 100, replace = TRUE)
deleted_ids$col_delete <- col_delete
# Keep the drawn count only for ids flagged 1 (= delete); every other id gets
# the string "NONE". Mixing numbers with "NONE" makes this column character.
flagged_for_delete <- deleted_ids$delete_or_not_delete == "1"
deleted_ids$final_number_of_col_delete <- ifelse(
  flagged_for_delete,
  deleted_ids$col_delete,
  "NONE"
)
# Drop the two helper columns so only id and the final count remain.
deleted_ids$col_delete <- NULL
deleted_ids$delete_or_not_delete <- NULL

最后,我有这样的东西:

  id final_number_of_col_delete
1  1                       NONE
2  2                       NONE
3  3                         14
4  4                         14
5  5                         12
6  6                       NONE

基于这个数据框 (deleted_ids),我想在 “my_data” 中,对每个被选中的 id,把该行从左边起(不含 “id” 列)指定数量的列替换为 NA。

有人可以告诉我怎么做吗?

注:这里的“删除”意思是“用NA替换条目”。

由于所有组的权重和状态具有相同的特征,只需在一个 `matrix` 中一次性生成所有的 `rnorm` 和 `sample` 值。

然后用 `mapply` 进行随机删除,以 TRUE/FALSE、连续的 ID 和列向量作为输入。`<<-` 会修改全局环境中的对象。

最后用 `as.data.frame` 转换为数据框,并用 `setNames` 设置列名。

n <- 100
set.seed(42)
# One numeric matrix holding everything: column 1 is the id, columns 2-11 are
# the ten weights and columns 12-21 are the ten states.
M <- matrix(
  c(
    seq_len(n),
    rnorm(10 * n, 100, 10),
    sample.int(5, 10 * n, replace = TRUE)
  ),
  nrow = n,
  ncol = 1 + 10 + 10
)
# Visit the rows in order. For each row flip a coin; on TRUE pick one column
# index from `cols` and set that column and the one ten positions to its right
# to NA. `<<-` is needed so the assignment reaches the M defined above rather
# than a copy local to the function.
invisible(mapply(
  function(row, coin, cols) {
    if (sample(coin, 1)) {
      picked <- sample(cols, 1)
      M[row, c(picked, picked + 10)] <<- NA
    }
  },
  row = seq_len(n),
  MoreArgs = list(coin = c(TRUE, FALSE), cols = 2:10)
))
# Convert to a data frame and attach the column names.
dat <- setNames(
  as.data.frame(M),
  c("id", paste0("weight_time_", 1:10), paste0("state_time_", 1:10))
)

看起来像这样:

head(dat)
#   id weight_time_1 weight_time_2 weight_time_3 weight_time_4 weight_time_5 weight_time_6 weight_time_7
# 1  1     113.70958     112.00965      79.99071      99.95379     113.34913     110.29141      97.51517
# 2  2      94.35302     110.44751     103.33777     107.60242      91.30728     109.14775            NA
# 3  3     103.63128      89.96791     111.71325     100.38991            NA      99.97544     109.87653
# 4  4     106.32863     118.48482     120.59539     107.35072            NA     101.36010     108.35568
# 5  5     104.04268      93.33227      86.23138      98.53527      94.21644      92.79846      93.39478
# 6  6      98.93875     101.05514      88.49144            NA      90.01261      98.01876     115.64069
#   weight_time_8 weight_time_9 weight_time_10 state_time_1 state_time_2 state_time_3 state_time_4 state_time_5
# 1     102.94692     106.88808      109.41924            2            4            5            3            2
# 2     103.92741     107.25083       97.51386            5            5            3            5            4
# 3      89.99156     102.17380      100.96479            3            5            4            5           NA
# 4      96.74273      97.98343       95.66069            4            2            1            1           NA
# 5      89.91651            NA      121.78668            4            3            4            1            1
# 6      93.64569      96.91062       70.41220            4            1            5           NA            4
#   state_time_6 state_time_7 state_time_8 state_time_9 state_time_10
# 1            5            4            2            3             5
# 2            5           NA            2            3             4
# 3            4            1            5            1             1
# 4            5            4            5            3             4
# 5            4            4            4           NA             3
# 6            4            2            5            4             1
   # Blank out ("delete") a random-length run of leading data columns in each
   # row of `df`. Assumes the first column of `df` is the id (as in `my_data`)
   # and must be kept -- confirm against the data actually passed in.
   #
   # `opts` lists the possible numbers of columns to blank; the NA entries
   # mean "leave this row untouched" (4 of the 13 equally likely options).
   opts <- c(NA, NA, NA, NA, 2, 4, 6, 8, 10, 12, 14, 16, 18)
   # replace = TRUE is required: there are only 13 options, so sampling
   # without replacement errors as soon as `df` has more than 13 rows.
   del_to <- sample(opts, nrow(df), replace = TRUE)
   
   for (i in seq_len(nrow(df))) {
     if (!is.na(del_to[i])) {
       # Shift by one so column 1 (id) survives and exactly del_to[i] data
       # columns are set to NA.
       df[i, 1 + seq_len(del_to[i])] <- NA
     }
   }