R 中按另一列分组计算累计和,并对多个分组列排除当前值
Cumulative sum in R based on another column, excluding the current value, for more than one column
我想根据另一列计算一列的累计总和,下面的代码成功地做到了。但另外我需要排除当前元素。
library(data.table)

# Example data: two grouping columns plus a 0/1 target column.
cat_var <- c("rock", "indie", "rock", "rock", "pop", "indie", "rock")
cat_var_2 <- c("blue", "green", "red", "red", "blue", "green", "blue")
target_var <- c(0, 0, 1, 1, 1, 1, 0)
df <- data.table(
  categorical_variable = cat_var,
  categorical_variable_2 = cat_var_2,
  target_variable = target_var
)

# Running sum of target_variable within each categorical_variable group;
# the current row's value is still included at this point.
ave(df[, "target_variable"], df[, "categorical_variable"], FUN = cumsum)
现在我可以按 categorical_variable 分组计算 target_variable 的累计和。我想用一段代码分别按 categorical_variable 和 categorical_variable_2 分组计算累计和,并且排除当前值。像这样:
# Wished-for form: hand both grouping columns to a single ave() call.
ave(
  df[, "target_variable"],
  df[, c("categorical_variable", "categorical_variable_2")],
  FUN = cumsum
)
预期输出为:
# Desired result for grouping by categorical_variable: for each row, the sum
# of target_variable over the earlier rows of the same group.
categorical_variable_transformed <- c(0, 0, 0, 1, 0, 0, 2)
df$categorical_variable_transformed <- categorical_variable_transformed

# Same idea, grouping by categorical_variable_2 instead.
categorical_variable_2_transformed <- c(0, 0, 0, 1, 0, 0, 1)
df$categorical_variable_2_transformed <- categorical_variable_2_transformed
使用 .SD,这个问题似乎很容易解决:
# Exclusive running sum: the inclusive cumsum minus the current value leaves
# the total of all earlier rows in the group.
excl_cumsum <- function(x) cumsum(x) - x

# Group by each categorical column separately and store the result in a new
# "<column>_transformed" column, so target_variable itself is left untouched.
# .SDcols restricts .SD to target_variable, whatever other columns df holds.
for (g in c("categorical_variable", "categorical_variable_2")) {
  df[, (paste0(g, "_transformed")) := lapply(.SD, excl_cumsum),
     by = g, .SDcols = "target_variable"]
}
df
# categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
#1: rock blue 0 0 0
#2: indie green 0 0 0
#3: rock red 1 0 0
#4: rock red 1 1 1
#5: pop blue 1 0 0
#6: indie green 1 0 0
#7: rock blue 0 2 1
试试这个:
library(data.table)

# For every grouping column: running sum of target_variable within the group,
# minus the row's own value. Both result columns are added in a single `:=`.
nms <- c("categorical_variable", "categorical_variable_2")
df[, paste0(nms, "_transformed") := lapply(
  nms,
  function(g) {
    running_total <- ave(target_variable, get(g), FUN = cumsum)
    running_total - target_variable
  }
)]
df
# categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
# <char> <char> <num> <num> <num>
# 1: rock blue 0 0 0
# 2: indie green 0 0 0
# 3: rock red 1 0 0
# 4: rock red 1 1 1
# 5: pop blue 1 0 0
# 6: indie green 1 0 0
# 7: rock blue 0 2 1
由于 df 是 data.table,我们可以使用 data.table 的方法:
# Grouping columns: every "categorical" column except previously generated
# "*_transformed" outputs, so the snippet stays correct when df already holds
# result columns (for example when it is run a second time).
nm1 <- grep("categorical", names(df), value = TRUE)
nm1 <- nm1[!endsWith(nm1, "_transformed")]
nm2 <- paste0(nm1, "_transformed")
# cumsum() includes the current row; subtracting target_variable leaves only
# the sum of the earlier rows of the group.
for (i in seq_along(nm1)) {
  df[, (nm2[i]) := cumsum(target_variable) - target_variable, by = c(nm1[i])]
}
输出:
> df
categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
1: rock blue 0 0 0
2: indie green 0 0 0
3: rock red 1 0 0
4: rock red 1 1 1
5: pop blue 1 0 0
6: indie green 1 0 0
7: rock blue 0 2 1
我想根据另一列计算一列的累计总和,下面的代码成功地做到了。但另外我需要排除当前元素。
library(data.table)

# Example data: two grouping columns plus a 0/1 target column.
cat_var <- c("rock", "indie", "rock", "rock", "pop", "indie", "rock")
cat_var_2 <- c("blue", "green", "red", "red", "blue", "green", "blue")
target_var <- c(0, 0, 1, 1, 1, 1, 0)
df <- data.table(
  categorical_variable = cat_var,
  categorical_variable_2 = cat_var_2,
  target_variable = target_var
)

# Running sum of target_variable within each categorical_variable group;
# the current row's value is still included at this point.
ave(df[, "target_variable"], df[, "categorical_variable"], FUN = cumsum)
现在我可以按 categorical_variable 分组计算 target_variable 的累计和。我想用一段代码分别按 categorical_variable 和 categorical_variable_2 分组计算累计和,并且排除当前值。像这样:
# Wished-for form: hand both grouping columns to a single ave() call.
ave(
  df[, "target_variable"],
  df[, c("categorical_variable", "categorical_variable_2")],
  FUN = cumsum
)
预期输出为:
# Desired result for grouping by categorical_variable: for each row, the sum
# of target_variable over the earlier rows of the same group.
categorical_variable_transformed <- c(0, 0, 0, 1, 0, 0, 2)
df$categorical_variable_transformed <- categorical_variable_transformed

# Same idea, grouping by categorical_variable_2 instead.
categorical_variable_2_transformed <- c(0, 0, 0, 1, 0, 0, 1)
df$categorical_variable_2_transformed <- categorical_variable_2_transformed
使用 .SD,这个问题似乎很容易解决:
# Exclusive running sum: the inclusive cumsum minus the current value leaves
# the total of all earlier rows in the group.
excl_cumsum <- function(x) cumsum(x) - x

# Group by each categorical column separately and store the result in a new
# "<column>_transformed" column, so target_variable itself is left untouched.
# .SDcols restricts .SD to target_variable, whatever other columns df holds.
for (g in c("categorical_variable", "categorical_variable_2")) {
  df[, (paste0(g, "_transformed")) := lapply(.SD, excl_cumsum),
     by = g, .SDcols = "target_variable"]
}
df
# categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
#1: rock blue 0 0 0
#2: indie green 0 0 0
#3: rock red 1 0 0
#4: rock red 1 1 1
#5: pop blue 1 0 0
#6: indie green 1 0 0
#7: rock blue 0 2 1
试试这个:
library(data.table)

# For every grouping column: running sum of target_variable within the group,
# minus the row's own value. Both result columns are added in a single `:=`.
nms <- c("categorical_variable", "categorical_variable_2")
df[, paste0(nms, "_transformed") := lapply(
  nms,
  function(g) {
    running_total <- ave(target_variable, get(g), FUN = cumsum)
    running_total - target_variable
  }
)]
df
# categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
# <char> <char> <num> <num> <num>
# 1: rock blue 0 0 0
# 2: indie green 0 0 0
# 3: rock red 1 0 0
# 4: rock red 1 1 1
# 5: pop blue 1 0 0
# 6: indie green 1 0 0
# 7: rock blue 0 2 1
由于 df 是 data.table,我们可以使用 data.table 的方法:
# Grouping columns: every "categorical" column except previously generated
# "*_transformed" outputs, so the snippet stays correct when df already holds
# result columns (for example when it is run a second time).
nm1 <- grep("categorical", names(df), value = TRUE)
nm1 <- nm1[!endsWith(nm1, "_transformed")]
nm2 <- paste0(nm1, "_transformed")
# cumsum() includes the current row; subtracting target_variable leaves only
# the sum of the earlier rows of the group.
for (i in seq_along(nm1)) {
  df[, (nm2[i]) := cumsum(target_variable) - target_variable, by = c(nm1[i])]
}
输出:
> df
categorical_variable categorical_variable_2 target_variable categorical_variable_transformed categorical_variable_2_transformed
1: rock blue 0 0 0
2: indie green 0 0 0
3: rock red 1 0 0
4: rock red 1 1 1
5: pop blue 1 0 0
6: indie green 1 0 0
7: rock blue 0 2 1