使用 R dplyr 根据多个条件找到最佳组合
Finding the best possible combinations based on multiple conditions with R dplyr
我的目标
我的目标是根据 index(指数)找出 10 名球员的最佳组合,并且这 10 名球员的总分(score 之和)必须在 95.5–100.4 之间。
详情
有一个重要的细节:这 10 名球员必须按 role 列满足 2 个 C(两名中锋)、4 个 F(四名前锋)和 4 个 G(四名后卫)。
目前我还在用循环苦苦尝试,但我相信 dplyr 包里一定有我没注意到的更好做法。非常感谢任何帮助或指导。
# Reproducible toy roster: 20 players, each with a random score and index,
# and roles cycling through C, F, F, G, G (4 centers, 8 forwards, 8 guards).
set.seed(123)
players <- paste("player", 1:20)
# Draw order matters for reproducibility: scores first, then indices.
score <- runif(n = 20, min = 4, max = 16.7)
index <- runif(n = 20, min = -1, max = 9)
role <- rep(c("C", "F", "F", "G", "G"), times = 4)
df <- data.frame(
  players = players,
  score = score,
  index = index,
  role = role
)
df
#> players score index role
#> 1 player 1 7.652235 7.8953932 C
#> 2 player 2 14.011475 5.9280341 F
#> 3 player 3 9.194007 5.4050681 F
#> 4 player 4 15.214321 8.9426978 G
#> 5 player 5 15.943935 5.5570580 G
#> 6 player 6 4.578568 6.0853047 C
#> 7 player 7 10.706940 4.4406602 F
#> 8 player 8 15.333722 4.9414202 F
#> 9 player 9 11.003225 1.8915974 G
#> 10 player 10 9.799007 0.4711365 G
#> 11 player 11 16.151783 8.6302423 C
#> 12 player 12 9.757344 8.0229905 F
#> 13 player 13 12.605147 5.9070528 F
#> 14 player 14 11.272444 6.9546742 G
#> 15 player 15 5.307143 -0.7538632 G
#> 16 player 16 15.427777 3.7779597 C
#> 17 player 17 7.125314 6.5845954 F
#> 18 player 18 4.534156 1.1640794 F
#> 19 player 19 8.164593 2.1818101 G
#> 20 player 20 16.122196 1.3162579 G
由 reprex 包 (v2.0.1) 于 2021-10-16 创建
感谢您的宝贵时间
更新:
到目前为止我的逻辑是:
- 转置我的 df
# Transpose the roster so that each player becomes a column. t() goes through
# as.matrix(), so with the mixed character/numeric columns of df every value
# ends up as a character string.
df <- as.data.frame(t(df))
- 并创建 10 名玩家的所有可能组合
# Enumerate every size-10 subset of the columns of df and return them as a
# list (simplify = FALSE). No role or score constraint is applied at this step.
combn(df, 10, simplify=FALSE)
现在我需要从中筛选出角色构成正确、且总分在 95.5–100.4 之间的组合。肯定应该有更聪明的方法。
我用了 data.table。
choose(4, 2) * choose(8, 4) * choose(8, 4) = 6 * 70 * 70 = 29400 种组合。基本上是蛮力枚举。我确定还有更优雅的解决方案。
library(data.table)
library(magrittr)

# Reproducible toy roster: 20 players with a random score and index each.
set.seed(123)
players <- paste("player", 1:20)
score <- runif(20, min = 4, max = 16.7)
index <- runif(20, min = -1, max = 9)
role <- rep(c("C", "F", "F", "G", "G"), 4)
dt <- data.table(players, score, index, role)

centers <- dt[role == "C"]
forwards <- dt[role == "F"]
guards <- dt[role == "G"]

# Enumerate every way of picking `k` players out of `pool`.
#
# pool:  data.table with columns players, score and index (one role only).
# k:     number of players to pick from the pool.
# label: role letter, e.g. "C"; player columns are named <label>1..<label>k
#        and the totals score_<label> / index_<label> (lower-case suffix).
#
# Returns a data.table with one row per combination: the k player names plus
# the summed score and summed index of those players.
role_combos <- function(pool, k, label) {
  picks <- combn(nrow(pool), k) # k x n_combos matrix of row numbers in pool
  slots <- seq_len(k)
  out <- as.data.table(t(matrix(pool$players[picks], nrow = k)))
  setnames(out, paste0(label, slots))
  # Add the slots left to right, i.e. player 1 + player 2 + ... + player k.
  total <- function(x) Reduce(`+`, lapply(slots, function(i) x[picks[i, ]]))
  set(out, j = paste0("score_", tolower(label)), value = total(pool$score))
  set(out, j = paste0("index_", tolower(label)), value = total(pool$index))
  out
}

c_combos <- role_combos(centers, 2, "C")
f_combos <- role_combos(forwards, 4, "F")
g_combos <- role_combos(guards, 4, "G")

# Cross join of the three per-role tables: choose(4, 2) * choose(8, 4) *
# choose(8, 4) = 29400 line-ups. Indexing each table once with the whole
# index vector avoids building 29400 one-row tables and binding them.
combined <- expand.grid(
  seq_len(nrow(c_combos)),
  seq_len(nrow(f_combos)),
  seq_len(nrow(g_combos))
)
ans <- data.table(
  c_combos[combined$Var1],
  f_combos[combined$Var2],
  g_combos[combined$Var3]
)

ans[, score := score_c + score_f + score_g]
ans[, index := index_c + index_f + index_g]
ans[, c("score_c", "score_f", "score_g", "index_c", "index_f", "index_g") := NULL]

# Best line-up by total index among those whose total score lies in the
# required range 95.5-100.4 (both bounds inclusive).
ans[score %between% c(95.5, 100.4), .SD[which.max(index)]]
给出:
C1 C2 F1 F2 F3 F4 G1 G2
1: player 1 player 6 player 3 player 7 player 12 player 17 player 4 player 5
G3 G4 score index
1: player 14 player 19 99.6097 62.07025
这是一种使用一些 tidyverse 和基本函数的方法。蛮力,但在这个规模下可能足够快(~0.2 秒)。
首先,我为每个位置生成球员向量,然后找出每个位置内部所有可能的组合,再用 tidyr::crossing 得到这些组合之间的所有组合。
把结果转换为长格式(pivot_longer)并与原始数据连接后,就可以很方便地计算每个组合的总分,并筛选出总分落在所需范围内的组合。
看起来有 1,878 种组合低于 95.5,有 24,743 种组合高于 100.4,还有 2,779 种组合在所需范围内。
library(dplyr)
library(tidyr)

# Split the roster by role so each position can be combined separately.
df_split <- split(df, df$role)

# Player names per role. `F` is deliberately not used as a variable name,
# because it would mask the built-in shorthand for FALSE.
players_c <- df_split$C$players
players_f <- df_split$F$players
players_g <- df_split$G$players

# One row per within-role combination: 2 centers, 4 forwards, 4 guards.
C_comb <- combn(players_c, 2) %>% t() %>% as_tibble()
F_comb <- combn(players_f, 4) %>% t() %>% as_tibble()
G_comb <- combn(players_g, 4) %>% t() %>% as_tibble()

# Cross the three tables to get every full line-up, reshape to one row per
# (line-up, player), attach each player's score, and keep the line-ups whose
# TOTAL score lies in 95.5-100.4. Both bounds are tested on total_score; the
# per-player `score` column never exceeds 16.7, so it cannot be used for the
# upper bound.
crossing(C_comb, F_comb, G_comb, .name_repair = "unique") %>%
  mutate(sim_num = row_number()) %>%
  pivot_longer(-sim_num) %>%
  left_join(df, by = c("value" = "players")) %>%
  group_by(sim_num) %>%
  mutate(total_score = sum(score)) %>%
  ungroup() %>%
  filter(total_score >= 95.5, total_score <= 100.4)
我认为一种高效的做法是使用蒙特卡罗(Monte Carlo)方法:无需构建包含所有可能组合的完整数据集,而是反复随机抽样,一旦抽到总分落在所需范围内的组合就停止搜索。
# Monte Carlo search: keep drawing random line-ups (2 C, 4 F, 4 G) until one
# has a total score inside 95.5-100.4, then show the selected rows of df.
# The loop only stops once such a line-up has been drawn.

# Players to draw per role, keyed by role so the pairing with split() below
# does not depend on the alphabetical order of the role labels.
n_per_role <- c(C = 2, F = 4, G = 4)
# Row numbers of df grouped by role; loop-invariant, so computed once.
rows_by_role <- split(seq_len(nrow(df)), df$role)

repeat {
  idx <- unlist(
    Map(
      # Index into the pool rather than calling sample(pool, k): sample()
      # treats a single number x as 1:x, which would pick wrong rows for a
      # role that has only one player.
      function(pool, k) pool[sample.int(length(pool), k)],
      rows_by_role,
      n_per_role[names(rows_by_role)]
    )
  )
  s <- sum(df$score[idx])
  # s is a single number, so the scalar, short-circuiting && is the right test.
  if (s >= 95.5 && s <= 100.4) break
}
df[sort(idx), ]
我的目标
我的目标是根据 index(指数)找出 10 名球员的最佳组合,并且这 10 名球员的总分(score 之和)必须在 95.5–100.4 之间。
详情
有一个重要的细节:这 10 名球员必须按 role 列满足 2 个 C(两名中锋)、4 个 F(四名前锋)和 4 个 G(四名后卫)。
目前我还在用循环苦苦尝试,但我相信 dplyr 包里一定有我没注意到的更好做法。非常感谢任何帮助或指导。
# Reproducible toy roster: 20 players, each with a random score and index,
# and roles cycling through C, F, F, G, G (4 centers, 8 forwards, 8 guards).
set.seed(123)
players <- paste("player", 1:20)
# Draw order matters for reproducibility: scores first, then indices.
score <- runif(n = 20, min = 4, max = 16.7)
index <- runif(n = 20, min = -1, max = 9)
role <- rep(c("C", "F", "F", "G", "G"), times = 4)
df <- data.frame(
  players = players,
  score = score,
  index = index,
  role = role
)
df
#> players score index role
#> 1 player 1 7.652235 7.8953932 C
#> 2 player 2 14.011475 5.9280341 F
#> 3 player 3 9.194007 5.4050681 F
#> 4 player 4 15.214321 8.9426978 G
#> 5 player 5 15.943935 5.5570580 G
#> 6 player 6 4.578568 6.0853047 C
#> 7 player 7 10.706940 4.4406602 F
#> 8 player 8 15.333722 4.9414202 F
#> 9 player 9 11.003225 1.8915974 G
#> 10 player 10 9.799007 0.4711365 G
#> 11 player 11 16.151783 8.6302423 C
#> 12 player 12 9.757344 8.0229905 F
#> 13 player 13 12.605147 5.9070528 F
#> 14 player 14 11.272444 6.9546742 G
#> 15 player 15 5.307143 -0.7538632 G
#> 16 player 16 15.427777 3.7779597 C
#> 17 player 17 7.125314 6.5845954 F
#> 18 player 18 4.534156 1.1640794 F
#> 19 player 19 8.164593 2.1818101 G
#> 20 player 20 16.122196 1.3162579 G
由 reprex 包 (v2.0.1) 于 2021-10-16 创建
感谢您的宝贵时间
更新:
到目前为止我的逻辑是:
- 转置我的 df
# Transpose the roster so that each player becomes a column. t() goes through
# as.matrix(), so with the mixed character/numeric columns of df every value
# ends up as a character string.
df <- as.data.frame(t(df))
- 并创建 10 名玩家的所有可能组合
# Enumerate every size-10 subset of the columns of df and return them as a
# list (simplify = FALSE). No role or score constraint is applied at this step.
combn(df, 10, simplify=FALSE)
现在我需要从中筛选出角色构成正确、且总分在 95.5–100.4 之间的组合。肯定应该有更聪明的方法。
我用了 data.table。choose(4, 2) * choose(8, 4) * choose(8, 4) = 6 * 70 * 70 = 29400 种组合。基本上是蛮力枚举。我确定还有更优雅的解决方案。
library(data.table)
library(magrittr)

# Reproducible toy roster: 20 players with a random score and index each.
set.seed(123)
players <- paste("player", 1:20)
score <- runif(20, min = 4, max = 16.7)
index <- runif(20, min = -1, max = 9)
role <- rep(c("C", "F", "F", "G", "G"), 4)
dt <- data.table(players, score, index, role)

centers <- dt[role == "C"]
forwards <- dt[role == "F"]
guards <- dt[role == "G"]

# Enumerate every way of picking `k` players out of `pool`.
#
# pool:  data.table with columns players, score and index (one role only).
# k:     number of players to pick from the pool.
# label: role letter, e.g. "C"; player columns are named <label>1..<label>k
#        and the totals score_<label> / index_<label> (lower-case suffix).
#
# Returns a data.table with one row per combination: the k player names plus
# the summed score and summed index of those players.
role_combos <- function(pool, k, label) {
  picks <- combn(nrow(pool), k) # k x n_combos matrix of row numbers in pool
  slots <- seq_len(k)
  out <- as.data.table(t(matrix(pool$players[picks], nrow = k)))
  setnames(out, paste0(label, slots))
  # Add the slots left to right, i.e. player 1 + player 2 + ... + player k.
  total <- function(x) Reduce(`+`, lapply(slots, function(i) x[picks[i, ]]))
  set(out, j = paste0("score_", tolower(label)), value = total(pool$score))
  set(out, j = paste0("index_", tolower(label)), value = total(pool$index))
  out
}

c_combos <- role_combos(centers, 2, "C")
f_combos <- role_combos(forwards, 4, "F")
g_combos <- role_combos(guards, 4, "G")

# Cross join of the three per-role tables: choose(4, 2) * choose(8, 4) *
# choose(8, 4) = 29400 line-ups. Indexing each table once with the whole
# index vector avoids building 29400 one-row tables and binding them.
combined <- expand.grid(
  seq_len(nrow(c_combos)),
  seq_len(nrow(f_combos)),
  seq_len(nrow(g_combos))
)
ans <- data.table(
  c_combos[combined$Var1],
  f_combos[combined$Var2],
  g_combos[combined$Var3]
)

ans[, score := score_c + score_f + score_g]
ans[, index := index_c + index_f + index_g]
ans[, c("score_c", "score_f", "score_g", "index_c", "index_f", "index_g") := NULL]

# Best line-up by total index among those whose total score lies in the
# required range 95.5-100.4 (both bounds inclusive).
ans[score %between% c(95.5, 100.4), .SD[which.max(index)]]
给出:
C1 C2 F1 F2 F3 F4 G1 G2
1: player 1 player 6 player 3 player 7 player 12 player 17 player 4 player 5
G3 G4 score index
1: player 14 player 19 99.6097 62.07025
这是一种使用一些 tidyverse 和基本函数的方法。蛮力,但在这个规模下可能足够快(~0.2 秒)。
首先,我为每个位置生成球员向量,然后找出每个位置内部所有可能的组合,再用 tidyr::crossing 得到这些组合之间的所有组合。
把结果转换为长格式(pivot_longer)并与原始数据连接后,就可以很方便地计算每个组合的总分,并筛选出总分落在所需范围内的组合。
看起来有 1,878 种组合低于 95.5,有 24,743 种组合高于 100.4,还有 2,779 种组合在所需范围内。
library(dplyr)
library(tidyr)

# Split the roster by role so each position can be combined separately.
df_split <- split(df, df$role)

# Player names per role. `F` is deliberately not used as a variable name,
# because it would mask the built-in shorthand for FALSE.
players_c <- df_split$C$players
players_f <- df_split$F$players
players_g <- df_split$G$players

# One row per within-role combination: 2 centers, 4 forwards, 4 guards.
C_comb <- combn(players_c, 2) %>% t() %>% as_tibble()
F_comb <- combn(players_f, 4) %>% t() %>% as_tibble()
G_comb <- combn(players_g, 4) %>% t() %>% as_tibble()

# Cross the three tables to get every full line-up, reshape to one row per
# (line-up, player), attach each player's score, and keep the line-ups whose
# TOTAL score lies in 95.5-100.4. Both bounds are tested on total_score; the
# per-player `score` column never exceeds 16.7, so it cannot be used for the
# upper bound.
crossing(C_comb, F_comb, G_comb, .name_repair = "unique") %>%
  mutate(sim_num = row_number()) %>%
  pivot_longer(-sim_num) %>%
  left_join(df, by = c("value" = "players")) %>%
  group_by(sim_num) %>%
  mutate(total_score = sum(score)) %>%
  ungroup() %>%
  filter(total_score >= 95.5, total_score <= 100.4)
我认为一种高效的做法是使用蒙特卡罗(Monte Carlo)方法:无需构建包含所有可能组合的完整数据集,而是反复随机抽样,一旦抽到总分落在所需范围内的组合就停止搜索。
# Monte Carlo search: keep drawing random line-ups (2 C, 4 F, 4 G) until one
# has a total score inside 95.5-100.4, then show the selected rows of df.
# The loop only stops once such a line-up has been drawn.

# Players to draw per role, keyed by role so the pairing with split() below
# does not depend on the alphabetical order of the role labels.
n_per_role <- c(C = 2, F = 4, G = 4)
# Row numbers of df grouped by role; loop-invariant, so computed once.
rows_by_role <- split(seq_len(nrow(df)), df$role)

repeat {
  idx <- unlist(
    Map(
      # Index into the pool rather than calling sample(pool, k): sample()
      # treats a single number x as 1:x, which would pick wrong rows for a
      # role that has only one player.
      function(pool, k) pool[sample.int(length(pool), k)],
      rows_by_role,
      n_per_role[names(rows_by_role)]
    )
  )
  s <- sum(df$score[idx])
  # s is a single number, so the scalar, short-circuiting && is the right test.
  if (s >= 95.5 && s <= 100.4) break
}
df[sort(idx), ]