r中多列的rowsum
rowsum for multiple columns in r
我可以按 `catVariables` 中各分类列的水平(level)对 `target` 列求和。但是,我不想在 for 循环中执行此操作,而是想一次性将其应用于所有分类列。for 循环会使代码运行时间更长,而以向量化方式执行此操作会更快。
# Training data: three categorical predictors and a numeric target.
col1 <- c("L", "R", "R", "L", "R", "L", "R", "L")
col2 <- c("R", "R", "R", "L", "L", "R", "L", "R")
col3 <- c("L", "-", "L", "R", "-", "L", "R", "-")
target <- c(1, 0, 0, 1, 1, 0, 1, 0)
dat <- data.frame(col1 = col1, col2 = col2, col3 = col3, target = target)

# Convert every character column to a factor; other columns pass through.
dat[] <- lapply(dat, function(column) {
  if (is.character(column)) as.factor(column) else column
})

# Names of the factor columns that get encoded below.
catVariables <- names(dat)[vapply(dat, is.factor, logical(1))]

# Test data: same values as the training vectors above.
test_dat <- data.frame(
  col1 = c("L", "R", "R", "L", "R", "L", "R", "L"),
  col2 = c("R", "R", "R", "L", "L", "R", "L", "R"),
  col3 = c("L", "-", "L", "R", "-", "L", "R", "-"),
  target = c(1, 0, 0, 1, 1, 0, 1, 0)
)

# For each categorical column, replace every level by that level's share of
# the total training target, in both the training and the test frame.
for (col in catVariables) {
  # One-column matrix with one row per level of dat[[col]].
  ratios <- rowsum(dat[["target"]], dat[[col]]) / sum(dat[["target"]])
  print(ratios)
  # `ratios` was computed before dat[[col]] is overwritten, so both look-ups
  # use the shares derived from the original training levels.
  dat[[col]] <- ratios[match(dat[[col]], rownames(ratios))]
  test_dat[[col]] <- ratios[match(test_dat[[col]], rownames(ratios))]
}
我们可以使用 dplyr 中的 `across`,在多个列上一次性执行 `rowsum`:
library(dplyr)
# Replace each categorical column of `dat` with the share of the total
# `target` that belongs to the row's level in that column.
dat %>%
  mutate(
    across(
      .cols = all_of(catVariables),
      .fns = function(level_col) {
        # One-column matrix of per-level shares, one row per level.
        share <- rowsum(target, level_col) / sum(target)
        # Expand the per-level shares back to one value per row.
        share[match(level_col, rownames(share))]
      }
    )
  )
输出:
col1 col2 col3 target
1 0.5 0.25 0.25 1
2 0.5 0.25 0.25 0
3 0.5 0.25 0.25 0
4 0.5 0.75 0.50 1
5 0.5 0.75 0.25 1
6 0.5 0.25 0.25 0
7 0.5 0.75 0.50 1
8 0.5 0.25 0.25 0
或者,若要同时使用测试数据(`test_dat`)和训练数据(`dat`),一种做法是遍历 `test_dat` 的各列,用列名(`cur_column()`)从 `dat` 中取出对应的列并按组计算 `rowsum`,然后用 `match` 将 `test_dat` 的列值与结果的行名匹配,从而把各组的比例展开到每一行:
# Encode the categorical columns of `test_dat` with per-level target shares
# computed from the training frame `dat`.
test_dat %>%
  mutate(
    across(
      .cols = all_of(catVariables),
      .fns = function(test_col) {
        # cur_column() is the name of the column being processed; use it to
        # pull the matching column out of `dat`.
        train_col <- dat[[cur_column()]]
        share <- rowsum(dat[["target"]], train_col) / sum(dat[["target"]])
        # Look up each test value among the row names of the share matrix.
        share[match(test_col, rownames(share))]
      }
    )
  )
col1 col2 col3 target
1 0.5 0.25 0.25 1
2 0.5 0.25 0.25 0
3 0.5 0.25 0.25 0
4 0.5 0.75 0.50 1
5 0.5 0.75 0.25 1
6 0.5 0.25 0.25 0
7 0.5 0.75 0.50 1
8 0.5 0.25 0.25 0
我可以按 `catVariables` 中各分类列的水平(level)对 `target` 列求和。但是,我不想在 for 循环中执行此操作,而是想一次性将其应用于所有分类列。for 循环会使代码运行时间更长,而以向量化方式执行此操作会更快。
# Training data: three categorical predictors and a numeric target.
col1 <- c("L", "R", "R", "L", "R", "L", "R", "L")
col2 <- c("R", "R", "R", "L", "L", "R", "L", "R")
col3 <- c("L", "-", "L", "R", "-", "L", "R", "-")
target <- c(1, 0, 0, 1, 1, 0, 1, 0)
dat <- data.frame(col1 = col1, col2 = col2, col3 = col3, target = target)

# Convert every character column to a factor; other columns pass through.
dat[] <- lapply(dat, function(column) {
  if (is.character(column)) as.factor(column) else column
})

# Names of the factor columns that get encoded below.
catVariables <- names(dat)[vapply(dat, is.factor, logical(1))]

# Test data: same values as the training vectors above.
test_dat <- data.frame(
  col1 = c("L", "R", "R", "L", "R", "L", "R", "L"),
  col2 = c("R", "R", "R", "L", "L", "R", "L", "R"),
  col3 = c("L", "-", "L", "R", "-", "L", "R", "-"),
  target = c(1, 0, 0, 1, 1, 0, 1, 0)
)

# For each categorical column, replace every level by that level's share of
# the total training target, in both the training and the test frame.
for (col in catVariables) {
  # One-column matrix with one row per level of dat[[col]].
  ratios <- rowsum(dat[["target"]], dat[[col]]) / sum(dat[["target"]])
  print(ratios)
  # `ratios` was computed before dat[[col]] is overwritten, so both look-ups
  # use the shares derived from the original training levels.
  dat[[col]] <- ratios[match(dat[[col]], rownames(ratios))]
  test_dat[[col]] <- ratios[match(test_dat[[col]], rownames(ratios))]
}
我们可以使用 dplyr 中的 `across`,在多个列上一次性执行 `rowsum`:
library(dplyr)
# Replace each categorical column of `dat` with the share of the total
# `target` that belongs to the row's level in that column.
dat %>%
  mutate(
    across(
      .cols = all_of(catVariables),
      .fns = function(level_col) {
        # One-column matrix of per-level shares, one row per level.
        share <- rowsum(target, level_col) / sum(target)
        # Expand the per-level shares back to one value per row.
        share[match(level_col, rownames(share))]
      }
    )
  )
输出:
col1 col2 col3 target
1 0.5 0.25 0.25 1
2 0.5 0.25 0.25 0
3 0.5 0.25 0.25 0
4 0.5 0.75 0.50 1
5 0.5 0.75 0.25 1
6 0.5 0.25 0.25 0
7 0.5 0.75 0.50 1
8 0.5 0.25 0.25 0
或者,若要同时使用测试数据(`test_dat`)和训练数据(`dat`),一种做法是遍历 `test_dat` 的各列,用列名(`cur_column()`)从 `dat` 中取出对应的列并按组计算 `rowsum`,然后用 `match` 将 `test_dat` 的列值与结果的行名匹配,从而把各组的比例展开到每一行:
# Encode the categorical columns of `test_dat` with per-level target shares
# computed from the training frame `dat`.
test_dat %>%
  mutate(
    across(
      .cols = all_of(catVariables),
      .fns = function(test_col) {
        # cur_column() is the name of the column being processed; use it to
        # pull the matching column out of `dat`.
        train_col <- dat[[cur_column()]]
        share <- rowsum(dat[["target"]], train_col) / sum(dat[["target"]])
        # Look up each test value among the row names of the share matrix.
        share[match(test_col, rownames(share))]
      }
    )
  )
col1 col2 col3 target
1 0.5 0.25 0.25 1
2 0.5 0.25 0.25 0
3 0.5 0.25 0.25 0
4 0.5 0.75 0.50 1
5 0.5 0.75 0.25 1
6 0.5 0.25 0.25 0
7 0.5 0.75 0.50 1
8 0.5 0.25 0.25 0