随机删除一行的一部分
Randomly Deleting Parts of a Row
我有以下数据框:
# Simulated panel data: 100 subjects observed at 10 time points, with one
# weight and one state recorded per time point.
n_obs <- 100
id <- seq_len(n_obs)

# Weights: independent normal draws with mean 100 and sd 10.
weight_time_1 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_2 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_3 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_4 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_5 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_6 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_7 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_8 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_9 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_10 <- rnorm(n_obs, mean = 100, sd = 10)

# States: integers 1..5 sampled with replacement.
state_time_1 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_2 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_3 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_4 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_5 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_6 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_7 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_8 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_9 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_10 <- sample.int(5, size = n_obs, replace = TRUE)

# Columns are interleaved as (weight, state) pairs per time point, after id.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)

# Show the first rows.
head(my_data)
id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1 1 119.3852 2 111.30729 5 99.11912 5 97.06366 1 103.73559 4 100.53940 3 90.98888 2 95.10628 3
2 2 124.5046 3 86.74208 4 96.87224 3 88.84019 2 92.39560 4 96.83324 3 108.60610 1 90.24227 3
3 3 98.3621 2 114.60002 1 91.61257 3 121.88707 2 103.78418 2 96.77586 2 103.58945 3 102.08050 3
4 4 102.8222 3 95.72920 5 92.51412 4 107.94097 4 105.07041 3 116.22625 1 100.52621 5 102.88718 1
5 5 114.0140 5 94.04442 2 112.10150 2 111.40825 4 90.93852 4 83.81637 3 118.08578 5 84.64170 3
6 6 113.0468 2 96.90621 1 102.99961 4 89.28867 1 107.19814 2 99.29141 1 79.91099 1 106.01940 1
weight_time_9 state_time_9 weight_time_10 state_time_10
1 105.34245 5 106.61219 4
2 93.87486 4 95.14339 1
3 99.22730 1 108.46509 4
4 88.78866 1 114.68032 5
5 93.28602 5 91.50742 1
6 104.14194 4 98.67597 2
我想从左边随机删除“每行的一部分”直到某列 - 这应该看起来像这样(“红线”指的是已删除的条目,例如替换为 NA):
我想到了以下方法:
第 1 步:首先,随机选择哪些 ID 有资格进行删除
# Randomly flag every id: 1 = delete, 2 = no delete.
id <- 1:100
delete_or_not_delete <- sample.int(2, size = 100, replace = TRUE)
deleted_ids <- data.frame(
  id = id,
  delete_or_not_delete = delete_or_not_delete
)
第 2 步: 对于被选中要删除的 id,随机选择要删除的列数(例如,不计“id”列,2 = 删除前 2 列,4 = 删除前 4 列,等等)
# Candidate counts of leading columns to blank out, then one draw per id.
col_delete <- c(2, 4, 6, 8, 10, 12, 14, 16, 18)
col_delete <- sample(col_delete, size = 100, replace = TRUE)

# Rows flagged 1 keep their drawn count; all others get the marker "NONE".
# Mixing numbers with "NONE" makes the resulting column character.
flagged <- deleted_ids$delete_or_not_delete == "1"
deleted_ids$final_number_of_col_delete <- ifelse(flagged, col_delete, "NONE")

# Drop the helper flag so only id and the final count remain.
deleted_ids$delete_or_not_delete <- NULL
最后,我有这样的东西:
id final_number_of_col_delete
1 1 NONE
2 2 NONE
3 3 14
4 4 14
5 5 12
6 6 NONE
基于此文件 (deleted_ids),来自“my_data” 我想:
- 不从对应于 id = 1 的行中删除任何内容
- 不从对应于 id = 2 的行中删除任何内容
- 删除id=3对应行的前14列(不包括id列)
- 删除id=4对应行的前14列(不包括id列)
- 删除id=5对应行的前12列(不包括id列)
- 不从对应于 id = 6 的行中删除任何内容
- 等等
有人可以告诉我怎么做吗?
注:这里的“删除”意思是“用NA替换条目”。
由于所有时间点的权重和状态各自具有相同的分布特征,只需用 rnorm 和 sample 一次性生成所有值,并放入一个 matrix 中。
然后用 mapply 进行随机删除,以 TRUE/FALSE 向量、行 ID 序列和列索引向量作为输入。
<<- 会修改全局环境中的对象(这里是矩阵 M)。
最后用 as.data.frame 和 setNames 将其转换为带列名的数据框。
n <- 100
set.seed(42)

# Draw everything up front. Filled column-wise, the matrix gets the ids in
# column 1, the weights in columns 2-11 and the states in columns 12-21.
ids <- seq_len(n)
weights <- rnorm(10 * n, 100, 10)
states <- sample.int(5, 10 * n, replace = TRUE)
M <- matrix(c(ids, weights, states), nrow = n, ncol = 1 + 10 + 10)

# For one row: draw TRUE/FALSE; on TRUE pick one column index from `cols` and
# set that weight and the matching state (10 columns to the right) to NA.
# `<<-` writes into the matrix M defined above rather than a local copy.
blank_one_time_point <- function(choices, row, cols) {
  if (sample(choices, 1)) {
    s <- sample(cols, 1)
    M[row, c(s, s + 10)] <<- NA
  }
}

# The two length-one lists are recycled across all rows; only the row index
# varies. Candidate columns are 2:10, so the tenth time point is never blanked.
invisible(
  mapply(blank_one_time_point, list(c(TRUE, FALSE)), seq_len(n), list(2:10))
)

column_names <- c("id", paste0("weight_time_", 1:10), paste0("state_time_", 1:10))
dat <- setNames(as.data.frame(M), column_names)
看起来像这样:
# Preview the first rows of `dat`; the commented lines that follow show the
# printed result.
head(dat)
# id weight_time_1 weight_time_2 weight_time_3 weight_time_4 weight_time_5 weight_time_6 weight_time_7
# 1 1 113.70958 112.00965 79.99071 99.95379 113.34913 110.29141 97.51517
# 2 2 94.35302 110.44751 103.33777 107.60242 91.30728 109.14775 NA
# 3 3 103.63128 89.96791 111.71325 100.38991 NA 99.97544 109.87653
# 4 4 106.32863 118.48482 120.59539 107.35072 NA 101.36010 108.35568
# 5 5 104.04268 93.33227 86.23138 98.53527 94.21644 92.79846 93.39478
# 6 6 98.93875 101.05514 88.49144 NA 90.01261 98.01876 115.64069
# weight_time_8 weight_time_9 weight_time_10 state_time_1 state_time_2 state_time_3 state_time_4 state_time_5
# 1 102.94692 106.88808 109.41924 2 4 5 3 2
# 2 103.92741 107.25083 97.51386 5 5 3 5 4
# 3 89.99156 102.17380 100.96479 3 5 4 5 NA
# 4 96.74273 97.98343 95.66069 4 2 1 1 NA
# 5 89.91651 NA 121.78668 4 3 4 1 1
# 6 93.64569 96.91062 70.41220 4 1 5 NA 4
# state_time_6 state_time_7 state_time_8 state_time_9 state_time_10
# 1 5 4 2 3 5
# 2 5 NA 2 3 4
# 3 4 1 5 1 1
# 4 5 4 5 3 4
# 5 4 4 4 NA 3
# 6 4 2 5 4 1
# Blank out a random-length run of leading columns in each row, never
# touching the `id` column.
# NOTE(review): the original snippet used `df` without defining it; it is
# taken here to be a working copy of `my_data` built earlier -- confirm.
df <- my_data

# Possible numbers of leading (non-id) columns to blank out. The four NA
# entries mean "leave this row untouched" (4 of the 13 options).
opts <- c(NA, NA, NA, NA, 2, 4, 6, 8, 10, 12, 14, 16, 18)

# replace = TRUE is required: there are more rows than options, and sampling
# without replacement fails when the sample is larger than the population.
del_to <- sample(opts, nrow(df), replace = TRUE)

for (i in seq_len(nrow(df))) {
  if (!is.na(del_to[i])) {
    # Offset by 1 so that column 1 (`id`) is kept and the del_to[i] columns
    # after it are replaced with NA.
    df[i, 1 + seq_len(del_to[i])] <- NA
  }
}
我有以下数据框:
# Simulated panel data: 100 subjects observed at 10 time points, with one
# weight and one state recorded per time point.
n_obs <- 100
id <- seq_len(n_obs)

# Weights: independent normal draws with mean 100 and sd 10.
weight_time_1 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_2 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_3 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_4 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_5 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_6 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_7 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_8 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_9 <- rnorm(n_obs, mean = 100, sd = 10)
weight_time_10 <- rnorm(n_obs, mean = 100, sd = 10)

# States: integers 1..5 sampled with replacement.
state_time_1 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_2 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_3 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_4 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_5 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_6 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_7 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_8 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_9 <- sample.int(5, size = n_obs, replace = TRUE)
state_time_10 <- sample.int(5, size = n_obs, replace = TRUE)

# Columns are interleaved as (weight, state) pairs per time point, after id.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)

# Show the first rows.
head(my_data)
id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1 1 119.3852 2 111.30729 5 99.11912 5 97.06366 1 103.73559 4 100.53940 3 90.98888 2 95.10628 3
2 2 124.5046 3 86.74208 4 96.87224 3 88.84019 2 92.39560 4 96.83324 3 108.60610 1 90.24227 3
3 3 98.3621 2 114.60002 1 91.61257 3 121.88707 2 103.78418 2 96.77586 2 103.58945 3 102.08050 3
4 4 102.8222 3 95.72920 5 92.51412 4 107.94097 4 105.07041 3 116.22625 1 100.52621 5 102.88718 1
5 5 114.0140 5 94.04442 2 112.10150 2 111.40825 4 90.93852 4 83.81637 3 118.08578 5 84.64170 3
6 6 113.0468 2 96.90621 1 102.99961 4 89.28867 1 107.19814 2 99.29141 1 79.91099 1 106.01940 1
weight_time_9 state_time_9 weight_time_10 state_time_10
1 105.34245 5 106.61219 4
2 93.87486 4 95.14339 1
3 99.22730 1 108.46509 4
4 88.78866 1 114.68032 5
5 93.28602 5 91.50742 1
6 104.14194 4 98.67597 2
我想从左边随机删除“每行的一部分”直到某列 - 这应该看起来像这样(“红线”指的是已删除的条目,例如替换为 NA):
我想到了以下方法:
第 1 步:首先,随机选择哪些 ID 有资格进行删除
# Randomly flag every id: 1 = delete, 2 = no delete.
id <- 1:100
delete_or_not_delete <- sample.int(2, size = 100, replace = TRUE)
deleted_ids <- data.frame(
  id = id,
  delete_or_not_delete = delete_or_not_delete
)
第 2 步: 对于被选中要删除的 id,随机选择要删除的列数(例如,不计“id”列,2 = 删除前 2 列,4 = 删除前 4 列,等等)
# Candidate counts of leading columns to blank out, then one draw per id.
col_delete <- c(2, 4, 6, 8, 10, 12, 14, 16, 18)
col_delete <- sample(col_delete, size = 100, replace = TRUE)

# Rows flagged 1 keep their drawn count; all others get the marker "NONE".
# Mixing numbers with "NONE" makes the resulting column character.
flagged <- deleted_ids$delete_or_not_delete == "1"
deleted_ids$final_number_of_col_delete <- ifelse(flagged, col_delete, "NONE")

# Drop the helper flag so only id and the final count remain.
deleted_ids$delete_or_not_delete <- NULL
最后,我有这样的东西:
id final_number_of_col_delete
1 1 NONE
2 2 NONE
3 3 14
4 4 14
5 5 12
6 6 NONE
基于此文件 (deleted_ids),来自“my_data” 我想:
- 不从对应于 id = 1 的行中删除任何内容
- 不从对应于 id = 2 的行中删除任何内容
- 删除id=3对应行的前14列(不包括id列)
- 删除id=4对应行的前14列(不包括id列)
- 删除id=5对应行的前12列(不包括id列)
- 不从对应于 id = 6 的行中删除任何内容
- 等等
有人可以告诉我怎么做吗?
注:这里的“删除”意思是“用NA替换条目”。
由于所有时间点的权重和状态各自具有相同的分布特征,只需用 rnorm 和 sample 一次性生成所有值,并放入一个 matrix 中。
然后用 mapply 进行随机删除,以 TRUE/FALSE 向量、行 ID 序列和列索引向量作为输入。
<<- 会修改全局环境中的对象(这里是矩阵 M)。
最后用 as.data.frame 和 setNames 将其转换为带列名的数据框。
n <- 100
set.seed(42)

# Draw everything up front. Filled column-wise, the matrix gets the ids in
# column 1, the weights in columns 2-11 and the states in columns 12-21.
ids <- seq_len(n)
weights <- rnorm(10 * n, 100, 10)
states <- sample.int(5, 10 * n, replace = TRUE)
M <- matrix(c(ids, weights, states), nrow = n, ncol = 1 + 10 + 10)

# For one row: draw TRUE/FALSE; on TRUE pick one column index from `cols` and
# set that weight and the matching state (10 columns to the right) to NA.
# `<<-` writes into the matrix M defined above rather than a local copy.
blank_one_time_point <- function(choices, row, cols) {
  if (sample(choices, 1)) {
    s <- sample(cols, 1)
    M[row, c(s, s + 10)] <<- NA
  }
}

# The two length-one lists are recycled across all rows; only the row index
# varies. Candidate columns are 2:10, so the tenth time point is never blanked.
invisible(
  mapply(blank_one_time_point, list(c(TRUE, FALSE)), seq_len(n), list(2:10))
)

column_names <- c("id", paste0("weight_time_", 1:10), paste0("state_time_", 1:10))
dat <- setNames(as.data.frame(M), column_names)
看起来像这样:
# Preview the first rows of `dat`; the commented lines that follow show the
# printed result.
head(dat)
# id weight_time_1 weight_time_2 weight_time_3 weight_time_4 weight_time_5 weight_time_6 weight_time_7
# 1 1 113.70958 112.00965 79.99071 99.95379 113.34913 110.29141 97.51517
# 2 2 94.35302 110.44751 103.33777 107.60242 91.30728 109.14775 NA
# 3 3 103.63128 89.96791 111.71325 100.38991 NA 99.97544 109.87653
# 4 4 106.32863 118.48482 120.59539 107.35072 NA 101.36010 108.35568
# 5 5 104.04268 93.33227 86.23138 98.53527 94.21644 92.79846 93.39478
# 6 6 98.93875 101.05514 88.49144 NA 90.01261 98.01876 115.64069
# weight_time_8 weight_time_9 weight_time_10 state_time_1 state_time_2 state_time_3 state_time_4 state_time_5
# 1 102.94692 106.88808 109.41924 2 4 5 3 2
# 2 103.92741 107.25083 97.51386 5 5 3 5 4
# 3 89.99156 102.17380 100.96479 3 5 4 5 NA
# 4 96.74273 97.98343 95.66069 4 2 1 1 NA
# 5 89.91651 NA 121.78668 4 3 4 1 1
# 6 93.64569 96.91062 70.41220 4 1 5 NA 4
# state_time_6 state_time_7 state_time_8 state_time_9 state_time_10
# 1 5 4 2 3 5
# 2 5 NA 2 3 4
# 3 4 1 5 1 1
# 4 5 4 5 3 4
# 5 4 4 4 NA 3
# 6 4 2 5 4 1
# Blank out a random-length run of leading columns in each row, never
# touching the `id` column.
# NOTE(review): the original snippet used `df` without defining it; it is
# taken here to be a working copy of `my_data` built earlier -- confirm.
df <- my_data

# Possible numbers of leading (non-id) columns to blank out. The four NA
# entries mean "leave this row untouched" (4 of the 13 options).
opts <- c(NA, NA, NA, NA, 2, 4, 6, 8, 10, 12, 14, 16, 18)

# replace = TRUE is required: there are more rows than options, and sampling
# without replacement fails when the sample is larger than the population.
del_to <- sample(opts, nrow(df), replace = TRUE)

for (i in seq_len(nrow(df))) {
  if (!is.na(del_to[i])) {
    # Offset by 1 so that column 1 (`id`) is kept and the del_to[i] columns
    # after it are replaced with NA.
    df[i, 1 + seq_len(del_to[i])] <- NA
  }
}