R:有条件地删除行的一部分
R: Conditionally Deleting Parts of a Row
我正在使用 R 编程语言。
我有以下数据:
# Reproducible example data: 100 subjects, each measured at 10 time points.
set.seed(123)

id <- 1:100

# Weights: one normal draw (mean 100, sd 10) per subject and time point.
# The vectors are drawn in time order, so the seed fixes every column.
weight_time_1  <- rnorm(100, 100, 10)
weight_time_2  <- rnorm(100, 100, 10)
weight_time_3  <- rnorm(100, 100, 10)
weight_time_4  <- rnorm(100, 100, 10)
weight_time_5  <- rnorm(100, 100, 10)
weight_time_6  <- rnorm(100, 100, 10)
weight_time_7  <- rnorm(100, 100, 10)
weight_time_8  <- rnorm(100, 100, 10)
weight_time_9  <- rnorm(100, 100, 10)
weight_time_10 <- rnorm(100, 100, 10)

# States: an integer from 1 to 5 per subject and time point.
state_time_1  <- sample.int(5, 100, replace = TRUE)
state_time_2  <- sample.int(5, 100, replace = TRUE)
state_time_3  <- sample.int(5, 100, replace = TRUE)
state_time_4  <- sample.int(5, 100, replace = TRUE)
state_time_5  <- sample.int(5, 100, replace = TRUE)
state_time_6  <- sample.int(5, 100, replace = TRUE)
state_time_7  <- sample.int(5, 100, replace = TRUE)
state_time_8  <- sample.int(5, 100, replace = TRUE)
state_time_9  <- sample.int(5, 100, replace = TRUE)
state_time_10 <- sample.int(5, 100, replace = TRUE)

# Wide layout: `id` first, then the (weight, state) pair of each time point.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)
数据看起来像这样:
id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1 1 94.39524 1 92.89593 5 121.98810 3 92.84758 4 99.26444 2 93.98107 2 110.74012 3 92.71781 2
2 2 97.69823 1 102.56884 4 113.12413 3 92.47311 2 88.31349 2 90.06301 1 99.72653 1 84.59558 3
3 3 115.58708 2 97.53308 1 97.34855 4 90.61461 2 93.65252 3 110.26785 3 99.66670 3 93.06905 1
4 4 100.70508 2 96.52457 3 105.43194 1 89.47487 4 99.71158 5 107.51061 5 84.83932 5 101.18849 4
5 5 101.29288 1 90.48381 3 95.85660 5 95.62840 2 106.70696 5 84.90833 5 107.90385 2 86.35291 4
6 6 117.15065 3 99.54972 5 95.23753 3 103.31179 2 83.49453 4 99.04853 3 97.89266 3 105.89983 2
weight_time_9 state_time_9 weight_time_10 state_time_10
1 103.5628 1 89.85886 4
2 93.4199 1 92.08686 2
3 108.5520 4 102.99594 2
4 111.5294 3 116.39052 1
5 102.7627 2 110.84617 1
6 101.4410 4 93.75433 3
我想从这个数据集中随机选择一些行;然后,对于这些被随机选中的行,我想随机选择一对 (weight_time, state_time),并删除这一对右边的所有内容。结果看起来像这样:
我最近发现了一些代码,展示了如何删除这对随机选择的左边的所有内容:
# Step 1: Randomly decide, for every id, whether its row is eligible for
# deletion (1 = delete, 2 = no delete).
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For every id, randomly pick how many columns are to be deleted, not
# counting the "id" column (2 = first 2 columns deleted, 4 = first 4 columns
# deleted, etc.). The counts are even so that a (weight_time, state_time)
# pair is never split. Rows that were not selected in step 1 get the marker
# "NONE", which makes the whole column character.
col_delete <- sample(c(2, 4, 6, 8, 10, 12, 14, 16, 18), 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  col_delete,
  "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete (replace with NA) everything on the left side of the
# selected rows. Rows marked "NONE" are simply skipped.
rows_to_blank <- which(deleted_ids$final_number_of_col_delete != "NONE")
for (i in rows_to_blank) {
  # The count is stored as text because of the "NONE" marker.
  n_delete <- as.integer(deleted_ids$final_number_of_col_delete[i])
  # Column 1 is the id and must be kept, so the first `n_delete` data columns
  # are columns 2 to n_delete + 1.
  my_data[i, 2:(n_delete + 1)] <- NA
}
现在,我正在尝试调整此代码,以便它从右侧删除所有内容,而不是从左侧删除所有内容。
有人可以告诉我怎么做吗?
谢谢!
注意:“删除”是指用 NA 替换
我认为这会有所帮助:
# Step 1: Randomly decide, for every id, whether its row is eligible for
# deletion (1 = delete, 2 = no delete).
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For every id, randomly pick the first column to be deleted.
# Column 1 is "id" and pair k sits in columns 2k and 2k + 1, so the even
# values 4, 6, ..., ncol - 1 are the weight_time columns of pairs 2 and up:
# the first pair is always kept and no pair is ever split. Rows that were
# not selected in step 1 get the marker "NONE", which makes the whole column
# character.
last_col <- ncol(my_data)
col_delete <- sample(seq(4, last_col - 1, by = 2), 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  col_delete,
  "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete (replace with NA) everything on the right side of the
# selected rows, from the picked column through the last column. Rows marked
# "NONE" are simply skipped.
rows_to_blank <- which(deleted_ids$final_number_of_col_delete != "NONE")
for (i in rows_to_blank) {
  # The column index is stored as text because of the "NONE" marker.
  first_col <- as.integer(deleted_ids$final_number_of_col_delete[i])
  my_data[i, first_col:last_col] <- NA
}
一个 tidyverse 解决方案
set.seed(1)

# Number of (weight_time, state_time) pairs: every column except `id`, two
# columns per pair.
n_grp <- (ncol(my_data) - 1) / 2

# Draw how many rows to modify, then which ones.
# NOTE: the drawn values are row numbers but are matched against `id` below,
# so this relies on `id` being equal to the row number.
n_rm <- sample(1:nrow(my_data), 1)
id_rm <- sort(sample(1:nrow(my_data), n_rm, replace = FALSE))

# Blank out a random tail of one row. `group_modify()` passes the group's
# data without the grouping column `id`, so position 1 is the first weight
# column. A pair is drawn at random and every column from that pair's weight
# column through the last column is replaced by NA.
blank_random_tail <- function(row_df, ...) {
  first_blank <- 2 * (sample(1:n_grp, 1) - 1) + 1
  mutate(row_df, across(first_blank:ncol(row_df), ~ NA))
}

rows_to_blank <- filter(my_data, id %in% id_rm)
rows_to_keep <- filter(my_data, !id %in% id_rm)

# Modify the selected rows one id at a time, then put them back together
# with the untouched rows in id order.
rows_to_blank %>%
  group_by(id) %>%
  group_modify(blank_random_tail) %>%
  ungroup() %>%
  bind_rows(rows_to_keep) %>%
  arrange(id)
另一个选项:
# Draw between 1 and 10 rows at random; only these rows are processed below.
picked_rows <- slice_sample(my_data, n = sample(1:10, 1))

# One long row per (id, column), so that the columns of an id can be cut off
# with head().
long_rows <- pivot_longer(picked_rows, -id)

# Per id, keep only the first 2, 4, ..., or 20 long rows (a random even
# count, i.e. whole weight/state pairs). `.keep = TRUE` leaves `id` in each
# piece so the pieces can be stacked again.
kept_pieces <- group_map(
  group_by(long_rows, id),
  \(piece, key) head(piece, sample(seq(2, 20, 2), 1)),
  .keep = TRUE
)

# Back to wide form: one row per id, one column per remaining name.
pivot_wider(
  bind_rows(kept_pieces),
  names_from = "name",
  values_from = "value"
)
简化的基础 R 解决方案:
# Build the example matrix `data`: 100 rows, an `id` column followed by ten
# (weight_time, state_time) pairs.
set.seed(123)

# Ten columns of normal weights (mean 100, sd 10), drawn column by column.
wt <- replicate(10, rnorm(100, 100, 10))
rownames(wt) <- 1:100
colnames(wt) <- paste0("weight_time_", 1:10)

# Ten columns of states, each an integer from 1 to 5.
st <- replicate(10, sample.int(5, 100, replace = TRUE))
rownames(st) <- 1:100
colnames(st) <- paste0("state_time_", 1:10)

# In cbind(wt, st) the weights are columns 1-10 and the states 11-20; reading
# the two index rows column-wise gives 1, 11, 2, 12, ... and so interleaves
# each weight with its state.
pair_order <- as.vector(rbind(1:10, 11:20))
data <- cbind(id = 1:100, cbind(wt, st)[, pair_order])

# Pick the rows to modify and, for every row, the first column to blank
# (an even column index from 2 to 18, i.e. a weight_time column).
del <- sample(c(TRUE, FALSE), 100, replace = TRUE)
from_col <- sample(seq(2, 18, 2), 100, replace = TRUE)

# Set the selected cells to NA: in every picked row, all columns from that
# row's start column through the last one. `del` and `from_col` have one
# entry per row and are recycled down the columns of the mask.
data[del & col(data) >= from_col] <- NA
由 reprex 包 (v2.0.1) 于 2022-05-28 创建
我正在使用 R 编程语言。
我有以下数据:
# Reproducible example data: 100 subjects, each measured at 10 time points.
set.seed(123)

id <- 1:100

# Weights: one normal draw (mean 100, sd 10) per subject and time point.
# The vectors are drawn in time order, so the seed fixes every column.
weight_time_1  <- rnorm(100, 100, 10)
weight_time_2  <- rnorm(100, 100, 10)
weight_time_3  <- rnorm(100, 100, 10)
weight_time_4  <- rnorm(100, 100, 10)
weight_time_5  <- rnorm(100, 100, 10)
weight_time_6  <- rnorm(100, 100, 10)
weight_time_7  <- rnorm(100, 100, 10)
weight_time_8  <- rnorm(100, 100, 10)
weight_time_9  <- rnorm(100, 100, 10)
weight_time_10 <- rnorm(100, 100, 10)

# States: an integer from 1 to 5 per subject and time point.
state_time_1  <- sample.int(5, 100, replace = TRUE)
state_time_2  <- sample.int(5, 100, replace = TRUE)
state_time_3  <- sample.int(5, 100, replace = TRUE)
state_time_4  <- sample.int(5, 100, replace = TRUE)
state_time_5  <- sample.int(5, 100, replace = TRUE)
state_time_6  <- sample.int(5, 100, replace = TRUE)
state_time_7  <- sample.int(5, 100, replace = TRUE)
state_time_8  <- sample.int(5, 100, replace = TRUE)
state_time_9  <- sample.int(5, 100, replace = TRUE)
state_time_10 <- sample.int(5, 100, replace = TRUE)

# Wide layout: `id` first, then the (weight, state) pair of each time point.
my_data <- data.frame(
  id,
  weight_time_1, state_time_1,
  weight_time_2, state_time_2,
  weight_time_3, state_time_3,
  weight_time_4, state_time_4,
  weight_time_5, state_time_5,
  weight_time_6, state_time_6,
  weight_time_7, state_time_7,
  weight_time_8, state_time_8,
  weight_time_9, state_time_9,
  weight_time_10, state_time_10
)
数据看起来像这样:
id weight_time_1 state_time_1 weight_time_2 state_time_2 weight_time_3 state_time_3 weight_time_4 state_time_4 weight_time_5 state_time_5 weight_time_6 state_time_6 weight_time_7 state_time_7 weight_time_8 state_time_8
1 1 94.39524 1 92.89593 5 121.98810 3 92.84758 4 99.26444 2 93.98107 2 110.74012 3 92.71781 2
2 2 97.69823 1 102.56884 4 113.12413 3 92.47311 2 88.31349 2 90.06301 1 99.72653 1 84.59558 3
3 3 115.58708 2 97.53308 1 97.34855 4 90.61461 2 93.65252 3 110.26785 3 99.66670 3 93.06905 1
4 4 100.70508 2 96.52457 3 105.43194 1 89.47487 4 99.71158 5 107.51061 5 84.83932 5 101.18849 4
5 5 101.29288 1 90.48381 3 95.85660 5 95.62840 2 106.70696 5 84.90833 5 107.90385 2 86.35291 4
6 6 117.15065 3 99.54972 5 95.23753 3 103.31179 2 83.49453 4 99.04853 3 97.89266 3 105.89983 2
weight_time_9 state_time_9 weight_time_10 state_time_10
1 103.5628 1 89.85886 4
2 93.4199 1 92.08686 2
3 108.5520 4 102.99594 2
4 111.5294 3 116.39052 1
5 102.7627 2 110.84617 1
6 101.4410 4 93.75433 3
我想从这个数据集中随机选择一些行;然后,对于这些被随机选中的行,我想随机选择一对 (weight_time, state_time),并删除这一对右边的所有内容。结果看起来像这样:
我最近发现了一些代码,展示了如何删除这对随机选择的左边的所有内容:
# Step 1: Randomly decide, for every id, whether its row is eligible for
# deletion (1 = delete, 2 = no delete).
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For every id, randomly pick how many columns are to be deleted, not
# counting the "id" column (2 = first 2 columns deleted, 4 = first 4 columns
# deleted, etc.). The counts are even so that a (weight_time, state_time)
# pair is never split. Rows that were not selected in step 1 get the marker
# "NONE", which makes the whole column character.
col_delete <- sample(c(2, 4, 6, 8, 10, 12, 14, 16, 18), 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  col_delete,
  "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete (replace with NA) everything on the left side of the
# selected rows. Rows marked "NONE" are simply skipped.
rows_to_blank <- which(deleted_ids$final_number_of_col_delete != "NONE")
for (i in rows_to_blank) {
  # The count is stored as text because of the "NONE" marker.
  n_delete <- as.integer(deleted_ids$final_number_of_col_delete[i])
  # Column 1 is the id and must be kept, so the first `n_delete` data columns
  # are columns 2 to n_delete + 1.
  my_data[i, 2:(n_delete + 1)] <- NA
}
现在,我正在尝试调整此代码,以便它从右侧删除所有内容,而不是从左侧删除所有内容。
有人可以告诉我怎么做吗?
谢谢!
注意:“删除”是指用 NA 替换
我认为这会有所帮助:
# Step 1: Randomly decide, for every id, whether its row is eligible for
# deletion (1 = delete, 2 = no delete).
id <- 1:100
delete_or_not_delete <- sample.int(2, 100, replace = TRUE)
deleted_ids <- data.frame(id, delete_or_not_delete)

# Step 2: For every id, randomly pick the first column to be deleted.
# Column 1 is "id" and pair k sits in columns 2k and 2k + 1, so the even
# values 4, 6, ..., ncol - 1 are the weight_time columns of pairs 2 and up:
# the first pair is always kept and no pair is ever split. Rows that were
# not selected in step 1 get the marker "NONE", which makes the whole column
# character.
last_col <- ncol(my_data)
col_delete <- sample(seq(4, last_col - 1, by = 2), 100, replace = TRUE)
deleted_ids$final_number_of_col_delete <- ifelse(
  deleted_ids$delete_or_not_delete == 1,
  col_delete,
  "NONE"
)
deleted_ids$delete_or_not_delete <- NULL

# Step 3: Delete (replace with NA) everything on the right side of the
# selected rows, from the picked column through the last column. Rows marked
# "NONE" are simply skipped.
rows_to_blank <- which(deleted_ids$final_number_of_col_delete != "NONE")
for (i in rows_to_blank) {
  # The column index is stored as text because of the "NONE" marker.
  first_col <- as.integer(deleted_ids$final_number_of_col_delete[i])
  my_data[i, first_col:last_col] <- NA
}
一个 tidyverse 解决方案
set.seed(1)

# Number of (weight_time, state_time) pairs: every column except `id`, two
# columns per pair.
n_grp <- (ncol(my_data) - 1) / 2

# Draw how many rows to modify, then which ones.
# NOTE: the drawn values are row numbers but are matched against `id` below,
# so this relies on `id` being equal to the row number.
n_rm <- sample(1:nrow(my_data), 1)
id_rm <- sort(sample(1:nrow(my_data), n_rm, replace = FALSE))

# Blank out a random tail of one row. `group_modify()` passes the group's
# data without the grouping column `id`, so position 1 is the first weight
# column. A pair is drawn at random and every column from that pair's weight
# column through the last column is replaced by NA.
blank_random_tail <- function(row_df, ...) {
  first_blank <- 2 * (sample(1:n_grp, 1) - 1) + 1
  mutate(row_df, across(first_blank:ncol(row_df), ~ NA))
}

rows_to_blank <- filter(my_data, id %in% id_rm)
rows_to_keep <- filter(my_data, !id %in% id_rm)

# Modify the selected rows one id at a time, then put them back together
# with the untouched rows in id order.
rows_to_blank %>%
  group_by(id) %>%
  group_modify(blank_random_tail) %>%
  ungroup() %>%
  bind_rows(rows_to_keep) %>%
  arrange(id)
另一个选项:
# Draw between 1 and 10 rows at random; only these rows are processed below.
picked_rows <- slice_sample(my_data, n = sample(1:10, 1))

# One long row per (id, column), so that the columns of an id can be cut off
# with head().
long_rows <- pivot_longer(picked_rows, -id)

# Per id, keep only the first 2, 4, ..., or 20 long rows (a random even
# count, i.e. whole weight/state pairs). `.keep = TRUE` leaves `id` in each
# piece so the pieces can be stacked again.
kept_pieces <- group_map(
  group_by(long_rows, id),
  \(piece, key) head(piece, sample(seq(2, 20, 2), 1)),
  .keep = TRUE
)

# Back to wide form: one row per id, one column per remaining name.
pivot_wider(
  bind_rows(kept_pieces),
  names_from = "name",
  values_from = "value"
)
简化的基础 R 解决方案:
# Build the example matrix `data`: 100 rows, an `id` column followed by ten
# (weight_time, state_time) pairs.
set.seed(123)

# Ten columns of normal weights (mean 100, sd 10), drawn column by column.
wt <- replicate(10, rnorm(100, 100, 10))
rownames(wt) <- 1:100
colnames(wt) <- paste0("weight_time_", 1:10)

# Ten columns of states, each an integer from 1 to 5.
st <- replicate(10, sample.int(5, 100, replace = TRUE))
rownames(st) <- 1:100
colnames(st) <- paste0("state_time_", 1:10)

# In cbind(wt, st) the weights are columns 1-10 and the states 11-20; reading
# the two index rows column-wise gives 1, 11, 2, 12, ... and so interleaves
# each weight with its state.
pair_order <- as.vector(rbind(1:10, 11:20))
data <- cbind(id = 1:100, cbind(wt, st)[, pair_order])

# Pick the rows to modify and, for every row, the first column to blank
# (an even column index from 2 to 18, i.e. a weight_time column).
del <- sample(c(TRUE, FALSE), 100, replace = TRUE)
from_col <- sample(seq(2, 18, 2), 100, replace = TRUE)

# Set the selected cells to NA: in every picked row, all columns from that
# row's start column through the last one. `del` and `from_col` have one
# entry per row and are recycled down the columns of the mask.
data[del & col(data) >= from_col] <- NA
由 reprex 包 (v2.0.1) 于 2022-05-28 创建