通过 data.frame 的每一行中的名称列表对加权平均值进行矢量化
Vectorize weighted mean by list of names in each row of a data.frame
我有两个数据框。我想根据 values
数据帧中的值计算 results
数据帧每一行的加权平均值。 results
的每一行都有两列列表。列表的每个可能组合都是 values
数据框中的一行。我正在使用下面的代码(两个选项)执行此操作,这可能比我试图解释它更清楚。我想知道的是我是否以及如何对其进行矢量化(我的原始结果数据框非常大)。
library(dplyr)
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.frame(a, f)
# > results
# a f
# 1 a, b, c p, q
# 2 a, b r
# 3 c s, t
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av, fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
# > values
# Var1 Var2 w x
# 1 a p 0.10710168 62.58004
# 2 b p 0.89175147 20.26853
# 3 c p 0.31489520 85.90532
# 4 a q 0.07263807 89.02293
# 5 b q 0.87090293 72.17195
# 6 c q 0.88818599 48.65717
# 7 a r 0.54076274 39.46479
# 8 b r 0.08678314 57.99200
# 9 c r 0.86298554 77.00845
# 10 a s 0.41778402 23.35626
# 11 b s 0.70227865 82.76310
# 12 c s 0.84415123 65.26321
# 13 a t 0.50651689 75.52230
# 14 b t 0.37850063 87.41811
# 15 c t 0.58515251 96.74228
# Option 1 with apply
calc_wa <- function(as, fs){
as <- unlist(strsplit(as, ", "))
fs <- unlist(strsplit(fs, ", "))
valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
return(wa_res)
}
results$res <- apply(results, 1, function(y) calc_wa(y['a'], y['f']))
# Option 2 with mutate
calc_wa2 <- function(as, fs){
as <- unlist(strsplit(as.character(as), ", "))
fs <- unlist(strsplit(as.character(fs), ", "))
valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
return(wa_res)
}
results <- results %>% rowwise() %>% mutate(res2= calc_wa2(a, f))
# > results
# Source: local data frame [3 x 4]
# Groups: <by row>
#
# # A tibble: 3 x 4
# a f res res2
# <fct> <fct> <dbl> <dbl>
# 1 a, b, c p, q 52.3 52.3
# 2 a, b r 42.0 42.0
# 3 c s, t 78.2 78.2
(恐怕我缺少一些基本命令,我也不知道如何 title/tag 这个问题 - 欢迎提出建议)
改用data.table
:
设置数据(做了一些细微的改动):
library(data.table)
set.seed(1) # added for reproducability
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.table(a, f) #slight change
# > results
# a f
# 1 a, b, c p, q
# 2 a, b r
# 3 c s, t
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av = av, fv = fv) #slight change
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
代码:
results[, rowID := 1:.N] # add ID
results_expand <- results[, expand.grid(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ","))), stringsAsFactors = FALSE), by = .(rowID)] # expand results
# Alternate: results_expand <- results[, CJ(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ",")))), by = .(rowID)] # expand results
results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv")) # merge to value table
results_expand <- results_expand[, .(wm = weighted.mean(x, w)), by = rowID] # calculate weight
results <- merge(results, results_expand, by = "rowID")
results
rowID a f wm
1: 1 a, b, c p, q 74.56427
2: 2 a, b r 45.37445
3: 3 c s, t 35.14175
这在 data.table 中使用了合并和分组功能,因此应该比任何循环选项都快。
@Chris 建议的相同程序,但使用 data.frame 而不是 data.table
library(dplyr);library(tidyr)
set.seed(1) # added for reproducability
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.frame(a, f)
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av=av, fv=fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
results$ID <- seq.int(nrow(results))
results_expand<- results %>%
group_by(ID) %>%
expand(as=trimws(unlist(strsplit(as.character(a), ","))), fs=trimws(unlist(strsplit(as.character(f), ","))))
results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv"))
results_expand <- results_expand %>% group_by(ID) %>% mutate(wm = weighted.mean(x, w))
results <- merge(results, results_expand, by = "ID")
results <- results %>% group_by(ID) %>% select(ID, a, f, wm)
results <- distinct(results)
我有两个数据框。我想根据 values
数据帧中的值计算 results
数据帧每一行的加权平均值。 results
的每一行都有两列列表。列表的每个可能组合都是 values
数据框中的一行。我正在使用下面的代码(两个选项)执行此操作,这可能比我试图解释它更清楚。我想知道的是我是否以及如何对其进行矢量化(我的原始结果数据框非常大)。
library(dplyr)
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.frame(a, f)
# > results
# a f
# 1 a, b, c p, q
# 2 a, b r
# 3 c s, t
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av, fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
# > values
# Var1 Var2 w x
# 1 a p 0.10710168 62.58004
# 2 b p 0.89175147 20.26853
# 3 c p 0.31489520 85.90532
# 4 a q 0.07263807 89.02293
# 5 b q 0.87090293 72.17195
# 6 c q 0.88818599 48.65717
# 7 a r 0.54076274 39.46479
# 8 b r 0.08678314 57.99200
# 9 c r 0.86298554 77.00845
# 10 a s 0.41778402 23.35626
# 11 b s 0.70227865 82.76310
# 12 c s 0.84415123 65.26321
# 13 a t 0.50651689 75.52230
# 14 b t 0.37850063 87.41811
# 15 c t 0.58515251 96.74228
# Option 1 with apply
calc_wa <- function(as, fs){
as <- unlist(strsplit(as, ", "))
fs <- unlist(strsplit(fs, ", "))
valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
return(wa_res)
}
results$res <- apply(results, 1, function(y) calc_wa(y['a'], y['f']))
# Option 2 with mutate
calc_wa2 <- function(as, fs){
as <- unlist(strsplit(as.character(as), ", "))
fs <- unlist(strsplit(as.character(fs), ", "))
valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
return(wa_res)
}
results <- results %>% rowwise() %>% mutate(res2= calc_wa2(a, f))
# > results
# Source: local data frame [3 x 4]
# Groups: <by row>
#
# # A tibble: 3 x 4
# a f res res2
# <fct> <fct> <dbl> <dbl>
# 1 a, b, c p, q 52.3 52.3
# 2 a, b r 42.0 42.0
# 3 c s, t 78.2 78.2
(恐怕我缺少一些基本命令,我也不知道如何 title/tag 这个问题 - 欢迎提出建议)
改用data.table
:
设置数据(做了一些细微的改动):
library(data.table)
set.seed(1) # added for reproducability
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.table(a, f) #slight change
# > results
# a f
# 1 a, b, c p, q
# 2 a, b r
# 3 c s, t
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av = av, fv = fv) #slight change
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
代码:
results[, rowID := 1:.N] # add ID
results_expand <- results[, expand.grid(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ","))), stringsAsFactors = FALSE), by = .(rowID)] # expand results
# Alternate: results_expand <- results[, CJ(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ",")))), by = .(rowID)] # expand results
results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv")) # merge to value table
results_expand <- results_expand[, .(wm = weighted.mean(x, w)), by = rowID] # calculate weight
results <- merge(results, results_expand, by = "rowID")
results
rowID a f wm
1: 1 a, b, c p, q 74.56427
2: 2 a, b r 45.37445
3: 3 c s, t 35.14175
这在 data.table 中使用了合并和分组功能,因此应该比任何循环选项都快。
@Chris 建议的相同程序,但使用 data.frame 而不是 data.table
library(dplyr);library(tidyr)
set.seed(1) # added for reproducability
a = c('a, b, c', 'a, b', 'c')
f = c('p, q', 'r', 's, t')
results <- data.frame(a, f)
av = c('a','b','c')
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av=av, fv=fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)
results$ID <- seq.int(nrow(results))
results_expand<- results %>%
group_by(ID) %>%
expand(as=trimws(unlist(strsplit(as.character(a), ","))), fs=trimws(unlist(strsplit(as.character(f), ","))))
results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv"))
results_expand <- results_expand %>% group_by(ID) %>% mutate(wm = weighted.mean(x, w))
results <- merge(results, results_expand, by = "ID")
results <- results %>% group_by(ID) %>% select(ID, a, f, wm)
results <- distinct(results)