基于dataframe进行加权计算

Question

我要计算一场比赛不同玩家的加权分数。我在前三列有他们的个人得分，在接下来的三列有他们的权重，这取决于比赛（每行代表一场比赛）。

谁能帮我有效地计算加权分数？

# Example data: one row per game. player1..player3 hold each player's score,
# weightplayer1..weightplayer3 hold the matching per-game weights.
df <- data.frame(
  player1 = c(1, 2, 1),
  player2 = c(2, 2, 1),
  player3 = c(2, 2, 2),
  weightplayer1 = c(0.7, 0.8, 0.7),
  weightplayer2 = c(0.6, 0.1, 0.6),
  weightplayer3 = c(0.2, 0.7, 0.2)
)

#  player1 player2 player3 weightplayer1 weightplayer2 weightplayer3  
#       1       2       2      0.7           0.6            0.2
#       2       2       2      0.8           0.1            0.7
#       1       1       2      0.7           0.6            0.2

我需要如下输出：weighted1scores 列是该场比赛中得分为 1 的玩家的权重之和，weighted2scores 列是得分为 2 的玩家的权重之和。实际上可能的得分取值很多，所以真实的数据框会有很多这样的列（一直到 weighted100scores 等），因此一个高效的公式或循环会非常有帮助。


#  player1 player2 player3 weightplayer1 weightplayer2 weightplayer3  weighted1scores  weighted2scores
#       1       2       2      0.7           0.6            0.2              0.7              0.8  
#       2       2       2      0.8           0.1            0.7               0               1.6
#       1       1       2      0.7           0.6            0.2              1.3              0.2

Answer 1

您可以将表规范化为 3NF，然后连接和聚合：

library(tidyverse)

# Input table: each row is a game; scores come first, then the weights that
# belong to the same player number.
df <- data.frame(
  player1 = c(1, 2, 1),
  player2 = c(2, 2, 1),
  player3 = c(2, 2, 2),
  weightplayer1 = c(0.7, 0.8, 0.7),
  weightplayer2 = c(0.6, 0.1, 0.6),
  weightplayer3 = c(0.2, 0.7, 0.2)
)

# Long-format score table: one row per (game, player).
# The row names of `df` become the `game` key, and the trailing digits of each
# "player*" column name become `player_id`. transmute() keeps only the three
# listed columns, so the weight columns are dropped here.
scores <- transmute(
  pivot_longer(as_tibble(df, rownames = "game"), starts_with("player")),
  game,
  player_id = str_extract(name, "[0-9]+$"),
  score = value
)
scores
#> # A tibble: 9 × 3
#>   game  player_id score
#>   <chr> <chr>     <dbl>
#> 1 1     1             1
#> 2 1     2             2
#> 3 1     3             2
#> 4 2     1             2
#> 5 2     2             2
#> 6 2     3             2
#> 7 3     1             1
#> 8 3     2             1
#> 9 3     3             2

# Long-format weight table: one row per (game, player).
# Built the same way as a score table would be, but from the "weight*" columns;
# `player_id` is the trailing number of the column name.
weights <- transmute(
  pivot_longer(as_tibble(df, rownames = "game"), starts_with("weight")),
  game,
  player_id = str_extract(name, "[0-9]+$"),
  weight = value
)
weights
#> # A tibble: 9 × 3
#>   game  player_id weight
#>   <chr> <chr>      <dbl>
#> 1 1     1            0.7
#> 2 1     2            0.6
#> 3 1     3            0.2
#> 4 2     1            0.8
#> 5 2     2            0.1
#> 6 2     3            0.7
#> 7 3     1            0.7
#> 8 3     2            0.6
#> 9 3     3            0.2

# Join scores to weights on their shared columns, total the weights per
# (player_id, game), and spread to one "weighted<player_id>" column per player.
# Note: the grouping key is the player, not the score value, so the columns
# produced here are per player rather than per score.
pivot_wider(
  summarise(
    group_by(inner_join(scores, weights), player_id, game),
    weighted = sum(weight)
  ),
  names_from = player_id,
  values_from = weighted,
  names_prefix = "weighted"
)
#> Joining, by = c("game", "player_id")
#> `summarise()` has grouped output by 'player_id'. You can override using the `.groups` argument.
#> # A tibble: 3 × 4
#>   game  weighted1 weighted2 weighted3
#>   <chr>     <dbl>     <dbl>     <dbl>
#> 1 1           0.7       0.6       0.2
#> 2 2           0.8       0.1       0.7
#> 3 3           0.7       0.6       0.2

<sup>由 reprex 包 (v2.0.1) 于 2021-09-20 创建</sup>

Answer 2

这是一种方法：

# Tidy table with one row per (game, player) and the product Score * Weight.
# local() keeps the intermediate tables out of the workspace; its value is
# still printed when run at top level.
local({
  with_game <- mutate(df, game = row_number())

  # Lengthen the score columns first, then the weight columns. The second
  # pivot pairs every score row with every weight column of the same game.
  long_scores <- pivot_longer(
    with_game,
    cols = starts_with("player"),
    names_to = "Player",
    values_to = "Score"
  )
  all_pairs <- pivot_longer(
    long_scores,
    cols = starts_with("weightplayer"),
    names_to = "WPlayer",
    values_to = "Weight"
  )

  # Keep only the pairs where score and weight refer to the same player number.
  same_player <- filter(all_pairs, parse_number(Player) == parse_number(WPlayer))

  mutate(select(same_player, -WPlayer), WeightedScore = Score * Weight)
})

你可以就此打住，直接返回这样一张整洁的表：

# A tibble: 9 x 5
   game Player  Score Weight WeightedScore
  <int> <chr>   <dbl>  <dbl>         <dbl>
1     1 player1     1    0.7           0.7
2     1 player2     2    0.6           1.2
3     1 player3     2    0.2           0.4
4     2 player1     2    0.8           1.6
5     2 player2     2    0.1           0.2
6     2 player3     2    0.7           1.4
7     3 player1     1    0.7           0.7
8     3 player2     1    0.6           0.6
9     3 player3     2    0.2           0.4

或继续：

# Build the tidy (game, Player, Score, Weight, WeightedScore) table and then
# spread it back to one row per game with <player>_<measure> columns.
df %>%
  mutate(game = row_number()) %>%
  pivot_longer(cols = starts_with("player"), names_to = "Player", values_to = "Score") %>%
  pivot_longer(cols = starts_with("weightplayer"), names_to = "WPlayer", values_to = "Weight") %>%
  # The double pivot pairs every score with every weight of the same game;
  # keep only the pairs that refer to the same player number.
  filter(parse_number(Player) == parse_number(WPlayer)) %>%
  select(-WPlayer) %>%
  mutate(
    WeightedScore = Score * Weight
  ) %>%
  # Stack the three measures so they can be renamed and widened in one go.
  pivot_longer(cols = c(Score, Weight, WeightedScore)) %>%
  mutate(name = paste(Player, name, sep = "_")) %>%
  # Name the argument in full instead of relying on partial matching of `id`.
  pivot_wider(id_cols = game)

结束于：

# A tibble: 3 x 10
   game player1_Score player1_Weight player1_WeightedScore player2_Score player2_Weight player2_WeightedScore player3_Score player3_Weight player3_WeightedScore
  <int>         <dbl>          <dbl>                 <dbl>         <dbl>          <dbl>                 <dbl>         <dbl>          <dbl>                 <dbl>
1     1             1            0.7                   0.7             2            0.6                   1.2             2            0.2                   0.4
2     2             2            0.8                   1.6             2            0.1                   0.2             2            0.7                   1.4
3     3             1            0.7                   0.7             1            0.6                   0.6             2            0.2                   0.4

Answer 3

到目前为止，其他解决方案无法重现您的输出，可能是因为您没有按照我们预期的方式使用“权重”。

执行以下操作：

# Sample input: three games (rows), three players. Scores are in player1..3
# and the weight for each player in weightplayer1..3.
df <- data.frame(
  player1 = c(1, 2, 1),
  player2 = c(2, 2, 1),
  player3 = c(2, 2, 2),
  weightplayer1 = c(0.7, 0.8, 0.7),
  weightplayer2 = c(0.6, 0.1, 0.6),
  weightplayer3 = c(0.2, 0.7, 0.2)
)

library(tidyverse)
# Tag each row with its game number so the aggregated columns can be joined
# back onto the original rows afterwards.
df <- df %>% mutate(game = row_number())
df %>%
  pivot_longer(
    # Everything except the key: all score and weight columns, however many.
    cols = -game,
    names_to = c(".value", "player_id"),
    # Backslashes must be doubled inside an R string. The lazy prefix plus
    # "\\d+$" keeps multi-digit player numbers (e.g. player10) in one piece.
    names_pattern = "^(.+?)(\\d+)$"
  ) %>%
  rename(score = player, weight = weightplayer, player = player_id) %>%
  # One group per game and score value; the label becomes the output column name.
  group_by(game, score_col = paste0("weighted", score, "score")) %>%
  summarize(weightedscore = sum(weight)) %>%
  # Score values that do not occur in a game get a total of 0 instead of NA.
  pivot_wider(names_from = score_col, values_from = weightedscore, values_fill = 0) %>%
  left_join(df, .) %>%
  select(-game) %>%
  as.data.frame() # just to print all columns
#> `summarise()` has grouped output by 'game'. You can override using the `.groups` argument.
#> Joining, by = "game"
#>   player1 player2 player3 weightplayer1 weightplayer2 weightplayer3
#> 1       1       2       2           0.7           0.6           0.2
#> 2       2       2       2           0.8           0.1           0.7
#> 3       1       1       2           0.7           0.6           0.2
#>   weighted1score weighted2score
#> 1            0.7            0.8
#> 2            0.0            1.6
#> 3            1.3            0.2

<sup>由 reprex 包 (v2.0.1) 于 2021-09-20 创建</sup>

我们首先将数据重塑为整洁格式（每行一个观察值）——理想情况下，在生成报告之前都应保持这种格式；然后进行聚合计算，再把结果重塑回宽格式，并按 game 连接回原始 data.frame。

Answer 4

这是另一种可能更简洁的方法：

library(tidyverse)

# Score values that each get their own "Weighted<score>scores" column.
# Extend this vector (e.g. 1:100) when more score values are possible.
score_values <- 1:2

df %>%
  mutate(output = pmap(., ~ {
    # All values of the current row, named by column.
    row <- c(...)
    x <- row[startsWith(names(row), "player")]
    y <- row[startsWith(names(row), "weight")]
    # Sum of the weights of the players holding each score; sum() of an empty
    # selection is 0, so absent scores yield 0.
    totals <- vapply(score_values, function(s) sum(y[x == s]), numeric(1))
    # Name the totals directly instead of patching auto-generated V1, V2, ...
    # names with a regex afterwards.
    names(totals) <- paste0("Weighted", score_values, "scores")
    totals
  })) %>%
  unnest_wider(output)

# A tibble: 3 x 8
  player1 player2 player3 weightplayer1 weightplayer2 weightplayer3 Weighted1scores
    <dbl>   <dbl>   <dbl>         <dbl>         <dbl>         <dbl>           <dbl>
1       1       2       2           0.7           0.6           0.2             0.7
2       2       2       2           0.8           0.1           0.7             0  
3       1       1       2           0.7           0.6           0.2             1.3
# ... with 1 more variable: Weighted2scores <dbl>

基于dataframe进行加权计算

Make weighted calculations based on dataframe

r

dataframe

weighted

dplyr

tidyverse