在 R 中使用 pmap 函数
using pmap function in R
我是第一次尝试使用 pmap,但不清楚该如何给它传递参数。这是我的测试数据集:
# data.table() comes from the data.table package, so attach it first.
library(data.table)

# Toy data set: nine rows, three per dependant code. `exposure` is drawn at
# random and no seed is set, so the exact values differ between runs.
overall <- data.table(
  dependant = rep(c("SPS", "DEPENDANT", "EMP"), 3),
  exposure  = rnorm(9, 0, 1),
  age       = c(1, 2, 3, 1, 2, 3, 3, 1, 2),
  gender    = rep(c("F", "F", "M"), 3)
)
我最初是这样做的:
# Each section aggregates exposure per (dependant, age, gender) for one
# dependant code and rescales it so the exposures of that code sum to 1.
# `:=` updates the aggregated table (SPS, DEPENDENT, EMP) by reference.

# spouse
SPS <- overall[
  dependant == "SPS",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureSPS <- sum(SPS$exposure)
SPSnormalized <- SPS[, exposure := exposure / sumExposureSPS][, .(age, gender, exposure)]

# dependant
# The data spell this code "DEPENDANT"; filtering on "DEPENDENT" matched no
# rows and produced an empty table.
DEPENDENT <- overall[
  dependant == "DEPENDANT",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureDEPENDENT <- sum(DEPENDENT$exposure)
DEPENDENTnormalized <- DEPENDENT[, exposure := exposure / sumExposureDEPENDENT][, .(age, gender, exposure)]

# employee
EMP <- overall[
  dependant == "EMP",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureEMP <- sum(EMP$exposure)
EMPnormalized <- EMP[, exposure := exposure / sumExposureEMP][, .(age, gender, exposure)]
但这非常重复,实际上只是名称不同而执行的操作始终相同。因此我写了一个函数:
#' Normalised exposure for one dependant code
#'
#' Sums `exposure` per (dependant, age, gender) over the rows of `overall`
#' whose `dependant` equals `dependantCode`, then divides by the total so the
#' returned exposures sum to 1.
#'
#' @param overall A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`.
#' @param dependantCode Value of `dependant` to keep, e.g. "SPS".
#' @return A data.table with columns `age`, `gender` and `exposure`.
calculateSubset <- function(overall,
                            dependantCode) {
  aggregated <- overall[
    dependant == dependantCode,
    .(exposure = sum(exposure)),
    by = c("dependant", "age", "gender")
  ]
  # `aggregated` is a new table, so `:=` here does not modify `overall`.
  aggregated[, exposure := exposure / sum(exposure)]
  # Return the normalised selection; previously it was computed but the
  # intermediate table (still carrying `dependant`) was returned instead.
  aggregated[, .(age, gender, exposure)]
}
所以我将其减少为:
# One call per dependant code. The code must match the spelling used in the
# data ("DEPENDANT"); "DEPENDENT" selects no rows.
SPSnormalized <- calculateSubset(overall = overall,
                                 dependantCode = "SPS")
DEPENDENTnormalized <- calculateSubset(overall = overall,
                                       dependantCode = "DEPENDANT")
EMPnormalized <- calculateSubset(overall = overall,
                                 dependantCode = "EMP")
然而,这仍然是重复的。我似乎见过一些使用 pmap 完全消除重复代码的例子。
如何把参数传给 pmap,才能最终得到所需的输出?
为简单起见,交换函数 calculateSubset 中两个参数的顺序。默认情况下,map 系列函数会把所迭代的列表(或向量)中的每个元素作为第一个参数传给函数。
#' Normalised exposure for one dependant code (code first, for use with map())
#'
#' Sums `exposure` per (dependant, age, gender) over the rows of `df` whose
#' `dependant` equals `dependantCode`, then divides by the total so the
#' exposures sum to 1.
#'
#' @param dependantCode Value of `dependant` to keep, e.g. "SPS".
#' @param df A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`. Defaults to the object named `overall` visible from this
#'   function when it is called.
#' @return A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`.
calculateSubset <- function(dependantCode, df = overall) {
  aggregated <- df[
    dependant == dependantCode,
    .(exposure = sum(exposure)),
    by = c("dependant", "age", "gender")
  ]
  # `aggregated` is a new table, so `:=` here does not modify `df`.
  aggregated[, exposure := exposure / sum(exposure)]
  # `dependant` is kept so each element of a mapped result shows its code.
  # The trailing `[]` lets the table print normally right after `:=`.
  aggregated[]
}
# map() and pmap() come from purrr, which also re-exports the %>% pipe.
library(purrr)

# One call per code; the result is a list with one table per code.
c("SPS", "DEPENDANT", "EMP") %>% map(calculateSubset)
# Note that the above map() call is equivalent to, but more concise than,
# this pmap() call:
# list(c("SPS", "DEPENDANT", "EMP")) %>% pmap(calculateSubset)
[[1]]
dependant age gender exposure
1: SPS 1 F 0.522064
2: SPS 3 F 0.477936
[[2]]
dependant age gender exposure
1: DEPENDANT 2 F -0.3019417
2: DEPENDANT 1 F 1.3019417
[[3]]
dependant age gender exposure
1: EMP 3 M 0.8140009
2: EMP 2 M 0.1859991
您应该把 dependant 放进 group_by 中,而不是先按 dependant 取子集再分别求 sum。
library(dplyr)

overall %>%
  # Total exposure per (dependant, gender, age) cell, then drop the grouping.
  group_by(dependant, gender, age) %>%
  summarise(exposure = sum(exposure), .groups = "drop") %>%
  # Regroup by dependant so prop.table() yields shares that sum to 1 within
  # each dependant code rather than across the whole table.
  group_by(dependant) %>%
  mutate(exposure = prop.table(exposure)) %>%
  ungroup()
# dependant gender age exposure
# <chr> <chr> <dbl> <dbl>
#1 DEPENDANT F 1 0.971
#2 DEPENDANT F 2 0.0292
#3 EMP M 2 0.158
#4 EMP M 3 0.842
#5 SPS F 1 1.39
#6 SPS F 3 -0.388
prop.table 在当前分组内计算占比,因此只有当数据在 summarise 之后仍按 dependant 分组时,mutate 才会给出各 dependant 组内的比率(每组之和为 1)。
我是第一次尝试使用 pmap,但不清楚该如何给它传递参数。这是我的测试数据集:
# data.table() comes from the data.table package, so attach it first.
library(data.table)

# Toy data set: nine rows, three per dependant code. `exposure` is drawn at
# random and no seed is set, so the exact values differ between runs.
overall <- data.table(
  dependant = rep(c("SPS", "DEPENDANT", "EMP"), 3),
  exposure  = rnorm(9, 0, 1),
  age       = c(1, 2, 3, 1, 2, 3, 3, 1, 2),
  gender    = rep(c("F", "F", "M"), 3)
)
我最初是这样做的:
# Each section aggregates exposure per (dependant, age, gender) for one
# dependant code and rescales it so the exposures of that code sum to 1.
# `:=` updates the aggregated table (SPS, DEPENDENT, EMP) by reference.

# spouse
SPS <- overall[
  dependant == "SPS",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureSPS <- sum(SPS$exposure)
SPSnormalized <- SPS[, exposure := exposure / sumExposureSPS][, .(age, gender, exposure)]

# dependant
# The data spell this code "DEPENDANT"; filtering on "DEPENDENT" matched no
# rows and produced an empty table.
DEPENDENT <- overall[
  dependant == "DEPENDANT",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureDEPENDENT <- sum(DEPENDENT$exposure)
DEPENDENTnormalized <- DEPENDENT[, exposure := exposure / sumExposureDEPENDENT][, .(age, gender, exposure)]

# employee
EMP <- overall[
  dependant == "EMP",
  .(exposure = sum(exposure)),
  by = c("dependant", "age", "gender")
]
sumExposureEMP <- sum(EMP$exposure)
EMPnormalized <- EMP[, exposure := exposure / sumExposureEMP][, .(age, gender, exposure)]
但这非常重复,实际上只是名称不同而执行的操作始终相同。因此我写了一个函数:
#' Normalised exposure for one dependant code
#'
#' Sums `exposure` per (dependant, age, gender) over the rows of `overall`
#' whose `dependant` equals `dependantCode`, then divides by the total so the
#' returned exposures sum to 1.
#'
#' @param overall A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`.
#' @param dependantCode Value of `dependant` to keep, e.g. "SPS".
#' @return A data.table with columns `age`, `gender` and `exposure`.
calculateSubset <- function(overall,
                            dependantCode) {
  aggregated <- overall[
    dependant == dependantCode,
    .(exposure = sum(exposure)),
    by = c("dependant", "age", "gender")
  ]
  # `aggregated` is a new table, so `:=` here does not modify `overall`.
  aggregated[, exposure := exposure / sum(exposure)]
  # Return the normalised selection; previously it was computed but the
  # intermediate table (still carrying `dependant`) was returned instead.
  aggregated[, .(age, gender, exposure)]
}
所以我将其减少为:
# One call per dependant code. The code must match the spelling used in the
# data ("DEPENDANT"); "DEPENDENT" selects no rows.
SPSnormalized <- calculateSubset(overall = overall,
                                 dependantCode = "SPS")
DEPENDENTnormalized <- calculateSubset(overall = overall,
                                       dependantCode = "DEPENDANT")
EMPnormalized <- calculateSubset(overall = overall,
                                 dependantCode = "EMP")
然而,这仍然是重复的。我似乎见过一些使用 pmap 完全消除重复代码的例子。
如何把参数传给 pmap,才能最终得到所需的输出?
为简单起见,交换函数 calculateSubset 中两个参数的顺序。默认情况下,map 系列函数会把所迭代的列表(或向量)中的每个元素作为第一个参数传给函数。
#' Normalised exposure for one dependant code (code first, for use with map())
#'
#' Sums `exposure` per (dependant, age, gender) over the rows of `df` whose
#' `dependant` equals `dependantCode`, then divides by the total so the
#' exposures sum to 1.
#'
#' @param dependantCode Value of `dependant` to keep, e.g. "SPS".
#' @param df A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`. Defaults to the object named `overall` visible from this
#'   function when it is called.
#' @return A data.table with columns `dependant`, `age`, `gender` and
#'   `exposure`.
calculateSubset <- function(dependantCode, df = overall) {
  aggregated <- df[
    dependant == dependantCode,
    .(exposure = sum(exposure)),
    by = c("dependant", "age", "gender")
  ]
  # `aggregated` is a new table, so `:=` here does not modify `df`.
  aggregated[, exposure := exposure / sum(exposure)]
  # `dependant` is kept so each element of a mapped result shows its code.
  # The trailing `[]` lets the table print normally right after `:=`.
  aggregated[]
}
# map() and pmap() come from purrr, which also re-exports the %>% pipe.
library(purrr)

# One call per code; the result is a list with one table per code.
c("SPS", "DEPENDANT", "EMP") %>% map(calculateSubset)
# Note that the above map() call is equivalent to, but more concise than,
# this pmap() call:
# list(c("SPS", "DEPENDANT", "EMP")) %>% pmap(calculateSubset)
[[1]]
dependant age gender exposure
1: SPS 1 F 0.522064
2: SPS 3 F 0.477936
[[2]]
dependant age gender exposure
1: DEPENDANT 2 F -0.3019417
2: DEPENDANT 1 F 1.3019417
[[3]]
dependant age gender exposure
1: EMP 3 M 0.8140009
2: EMP 2 M 0.1859991
您应该把 dependant 放进 group_by 中,而不是先按 dependant 取子集再分别求 sum。
library(dplyr)

overall %>%
  # Total exposure per (dependant, gender, age) cell, then drop the grouping.
  group_by(dependant, gender, age) %>%
  summarise(exposure = sum(exposure), .groups = "drop") %>%
  # Regroup by dependant so prop.table() yields shares that sum to 1 within
  # each dependant code rather than across the whole table.
  group_by(dependant) %>%
  mutate(exposure = prop.table(exposure)) %>%
  ungroup()
# dependant gender age exposure
# <chr> <chr> <dbl> <dbl>
#1 DEPENDANT F 1 0.971
#2 DEPENDANT F 2 0.0292
#3 EMP M 2 0.158
#4 EMP M 3 0.842
#5 SPS F 1 1.39
#6 SPS F 3 -0.388
prop.table 在当前分组内计算占比,因此只有当数据在 summarise 之后仍按 dependant 分组时,mutate 才会给出各 dependant 组内的比率(每组之和为 1)。