使用 dplyr 和 mutate 创建基于组和最后 n 行的新列

Using dplyr and mutate to create new columns based on groups and last n rows

我以下面的数据框为例:

# Example data: four matches, each with the same three players.
match_id <- rep(paste0("match_", 1:4), each = 3)
player_id <- rep(paste0("player_", 1:3), times = 4)
# Matches 3 and 4 share the same venue and opponent.
venue <- rep(paste("venue", c("A", "B", "C")), times = c(3, 3, 6))
opponent <- rep(paste("opponent", c("A", "B", "C")), times = c(3, 3, 6))
points <- c(5, 10, 15, 1, 2, 3, 5, 7, 9, 11, 2, 6)

data <- data.frame(
  match_id = match_id,
  player_id = player_id,
  venue = venue,
  opponent = opponent,
  points = points
)

我希望创建新的列,以根据分组显示点数列中的最后 n 个值。

我对第一种分组(按玩家)的尝试,以最近 3 场比赛为例:

# Sort by player with the highest match_id first, then add lagged copies of
# `points`. NOTE: there is no group_by() here, so lag() runs over the whole
# sorted table and values carry over from one player to the next.
library(dplyr)
data <- data %>%
  arrange(player_id, desc(match_id)) %>%
  mutate(
    last3_games = lag(points, n = 3),
    last2_games = lag(points, n = 2),
    last1_games = lag(points, n = 1)
  )

给出:

 head(data)
  match_id player_id   venue   opponent points last3_games last2_games last1_games
1  match_4  player_1 venue C opponent C     11    NA         NA          NA
2  match_3  player_1 venue C opponent C      6    NA         NA          11
3  match_2  player_1 venue B opponent B      1    NA         11          5
4  match_1  player_1 venue A opponent A      5    11         5           1
5  match_4  player_2 venue C opponent C      2     5         1           5
6  match_3  player_2 venue C opponent C      7     1         5           2
7   match_2  player_2 venue B opponent B     3     5         2           7
8   match_1  player_2 venue A opponent A     10    2         7           2

但我希望显示的值是每个玩家的最后 3 个点值,如下所示:

   match_id player_id   venue   opponent points last3_games last2_games last1_games
1   match_4  player_1 venue C opponent C     11    1         6           11
2   match_3  player_1 venue C opponent C      6    5         1           6
3   match_2  player_1 venue B opponent B      1    NA        5           1
4   match_1  player_1 venue A opponent A      5    NA        NA          5
5   match_4  player_2 venue C opponent C      2    3         7           2
6   match_3  player_2 venue C opponent C      7    10        3           7
7   match_2  player_2 venue B opponent B      3    NA        10          3
8   match_1  player_2 venue A opponent A     10    NA        NA          10

然后我也想通过对手和场地做同样的事情:

#by opponent
       match_id player_id   venue   opponent points last3_opponent last2_opponent last1_opponent
1  match_4  player_1 venue C opponent C     11             NA             5             11
2  match_3  player_1 venue C opponent C      5             NA             NA            5
3  match_2  player_1 venue B opponent B      1             NA             NA            1
4  match_1  player_1 venue A opponent A      5             NA             NA            5
5  match_4  player_2 venue C opponent C      2             NA             7             2
6  match_3  player_2 venue C opponent C      7             NA             NA            7
7  match_2  player_2 venue B opponent B      2             NA             NA            2
8  match_1  player_2 venue A opponent A     10             NA             NA            10

这里有很多NA's,但这是因为我只展示了一个小例子。

我的尝试是再次将它们分组,并用 mutate() 配合 lag(),但没有得到所需的输出:

# Sort by player, descending match_id, then opponent, and add lagged copies of
# `points`. As written, lag() is applied to the whole sorted table (ungrouped).
data <- data %>%
  arrange(player_id, desc(match_id), opponent) %>%
  mutate(
    last3_opponent = lag(points, n = 3),
    last2_opponent = lag(points, n = 2),
    last1_opponent = lag(points, n = 1)
  )

按对手和场地分组的输出应如下所示:

    #by opponent and venue
  match_id player_id   venue   opponent points last3_opp_ven last2_opp_ven last1_opp_ven
1  match_1  player_1 venue A opponent A      5            NA            NA            5
2  match_2  player_1 venue B opponent B      1            NA            NA            1
3  match_4  player_1 venue C opponent C     11            NA             5            11
4  match_3  player_1 venue C opponent C      5            NA            NA            5
5  match_1  player_2 venue A opponent A     10            NA            NA            10
6  match_2  player_2 venue B opponent B      2            NA            NA            2
7  match_4  player_2 venue C opponent C      2            NA            7             2
8  match_3  player_2 venue C opponent C      7            NA            NA            7

理想情况下,我想要的是最近 10 场比赛(最后 10 场、最后 9 场、最后 8 场等),并且由于有很多对手和很多场地(venue),生成的数据框将有很多列。

有没有更简单的方法?

我还想为每种分组在末尾添加一列,把该分组的所有值合并在一起,就像这样(以第一个按比赛的示例为例):

   match_id player_id   venue   opponent points last3_combined
1   match_4  player_1 venue C opponent C     11    1,6,11        
2   match_3  player_1 venue C opponent C      6    5,1,6         
3   match_2  player_1 venue B opponent B      1    NA,5,1        
4   match_1  player_1 venue A opponent A      5    NA,NA,5        
5   match_4  player_2 venue C opponent C      2    3,7,2         
6   match_3  player_2 venue C opponent C      7    10,3,7       
7   match_2  player_2 venue B opponent B      3    NA,10,3       
8   match_1  player_2 venue A opponent A     10    NA,NA,10        

更新:

假设我想使用另一个变量(在本例中为 game_x)作为分组依据,但这个变量只有两个值(1 和 0)。

我尝试了以下方法:

# Same example data as before, plus a 0/1 flag `game_x` per row.
match_id <- rep(paste0("match_", 1:4), each = 3)
player_id <- rep(paste0("player_", 1:3), times = 4)
venue <- rep(paste("venue", c("A", "B", "C")), times = c(3, 3, 6))
opponent <- rep(paste("opponent", c("A", "B", "C")), times = c(3, 3, 6))
game_x <- c(1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0)
points <- c(5, 10, 15, 1, 2, 3, 5, 7, 9, 11, 2, 6)

data <- data.frame(
  match_id = match_id,
  player_id = player_id,
  venue = venue,
  opponent = opponent,
  game_x = game_x,
  points = points
)

library(data.table)

setDT(data)

# Build the "last n" columns for one group's values.
#   x: vector of values for one group, in the order the caller sorted them.
#   n: number of columns to build (default 3).
# Returns a list of n vectors, each as long as x. Element j is x shifted
# forward by n - j positions and padded with NA at the end, so the last
# element of the list is x itself.
# seq_along(x) keeps the result well-formed for empty x (the old
# i:(i + length(x) - 1) yielded two NAs), and seq_len(n) avoids 0:1 at n = 0.
f <- function(x, n = 3) {
  lapply(rev(seq_len(n)), function(k) x[seq_along(x) + (k - 1L)])
}

# Within each (player_id, game_x) group, read the rows in descending match_id
# order and fill last3/last2/last1 from f(); the trailing [] prints the table.
data[
  order(player_id, game_x, -match_id),
  c("last3", "last2", "last1") := f(points, n = 3),
  by = c("player_id", "game_x")
][]

但它会产生这个:

    match_id player_id   venue   opponent game_x points last3 last2 last1
 1:  match_1  player_1 venue A opponent A      1      5    NA    NA     5
 2:  match_1  player_2 venue A opponent A      1     10    NA    NA    10
 3:  match_1  player_3 venue A opponent A      0     15    NA    NA    15
 4:  match_2  player_1 venue B opponent B      1      1    NA     5     1
 5:  match_2  player_2 venue B opponent B      0      2    NA    NA     2
 6:  match_2  player_3 venue B opponent B      1      3    NA    NA     3
 7:  match_3  player_1 venue C opponent C      1      5     5     1     5
 8:  match_3  player_2 venue C opponent C      0      7    NA     2     7
 9:  match_3  player_3 venue C opponent C      0      9    NA    15     9
10:  match_4  player_1 venue C opponent C      1     11     1     5    11
11:  match_4  player_2 venue C opponent C      0      2     2     7     2
12:  match_4  player_3 venue C opponent C      0      6    15     9     6

而它应该是这样的:

    match_id player_id   venue   opponent game_x points last3 last2 last1
 1:  match_4  player_1 venue C opponent C      1     11     1     5    11
 2:  match_3  player_1 venue C opponent C      1      5     5     1     5
 3:  match_2  player_1 venue B opponent B      1      1    NA     5     1
 4:  match_1  player_1 venue A opponent A      1      5    NA    NA     5
 5:  match_4  player_2 venue C opponent C      0      2    NA    NA    NA
 6:  match_3  player_2 venue C opponent C      0      7    10    NA    NA
 7:  match_2  player_2 venue B opponent B      0      2    NA    10    NA
 8:  match_1  player_2 venue A opponent A      1     10    NA    NA    10
 9:  match_4  player_3 venue C opponent C      0      6    NA    NA     3
10:  match_3  player_3 venue C opponent C      0      9    NA    3     NA
11:  match_2  player_3 venue B opponent B      1      3    NA    NA     3
12:  match_1  player_3 venue A opponent A      0     15    NA    NA    NA

我做错了什么?

更新二:

我试图跳过 game_x = 0 处的 NA(并将 game_x = 0 处的所有值替换为 NA)。

现在似乎正在发生的事情是,即使 game_x = 0,NA 也会出现在 last3、last2 等的列中。以最后 5 个为例:

# Five matches for a single player; match_id is numeric here.
match_id <- as.numeric(1:5)
player_id <- rep("player_1", times = 5)
venue <- rep(paste("venue", c("A", "B")), times = c(2, 3))
opponent <- paste("opponent", c("A", "B", "A", "C", "C"))
game_x <- c(1, 1, 0, 1, 0)
points <- c(5, 10, 15, 1, 2)

data <- data.frame(
  match_id = match_id,
  player_id = player_id,
  venue = venue,
  opponent = opponent,
  game_x = game_x,
  points = points
)

library(data.table)

setDT(data)

# Build the "last n" columns for one group's values, blanking masked-out rows.
#   x: vector of values for one group, in the order the caller sorted them.
#   n: number of columns to build (default 3).
#   m: logical mask the same length as x; values where m is FALSE are set to
#      NA before shifting (default: keep everything).
# Returns a list of n vectors, each as long as x. Element j is the masked x
# shifted forward by n - j positions and padded with NA at the end.
# seq_along(x) keeps the result well-formed for empty x (the old
# i:(i + length(x) - 1) yielded two NAs), and seq_len(n) avoids 0:1 at n = 0.
f <- function(x, n = 3, m = rep(TRUE, length(x))) {
  x[!m] <- NA
  lapply(rev(seq_len(n)), function(k) x[seq_along(x) + (k - 1L)])
}

# Per player, read rows in descending match_id order and fill last5..last1
# from f(), with rows where game_x != 1 masked to NA; then print the table
# sorted by player and descending match_id.
data[
  order(-match_id),
  c("last5", "last4", "last3", "last2", "last1") := f(points, n = 5, m = game_x == 1),
  by = "player_id"
][order(player_id, -match_id)][]

# Show the updated table again, highest match_id first.
data[order(-match_id)]

输出:

   match_id player_id   venue   opponent game_x points last5 last4 last3 last2 last1
1:        5  player_1 venue B opponent C      0      2     5    10    NA     1    NA
2:        4  player_1 venue B opponent C      1      1    NA     5    10    NA     1
3:        3  player_1 venue B opponent A      0     15    NA    NA     5       10    NA
4:        2  player_1 venue A opponent B      1     10    NA    NA    NA     5    10
5:        1  player_1 venue A opponent A      1      5    NA    NA    NA    NA     5

应该看起来像:

   match_id player_id   venue   opponent game_x points last5 last4 last3 last2 last1
1:        5  player_1 venue B opponent C      0      2   NA   NA    NA     NA   NA
2:        4  player_1 venue B opponent C      1      1   NA   NA    5    10     1
3:        3  player_1 venue B opponent A      0     15   NA   NA    NA    NA    NA
4:        2  player_1 venue A opponent B      1     10   NA   NA    NA     5    10
5:        1  player_1 venue A opponent A      1      5   NA   NA    NA    NA     5

您也许可以这样做:

  1. 将数据转换为 data.table
setDT(data)
  2. 创建一个小函数:给定一个向量作为输入,返回一个向量列表,依次给出最后 n 个得分
# Build the "last n" columns for one group's values.
#   x: vector of values for one group, in the order the caller sorted them.
#   n: number of columns to build (default 3).
# Returns a list of n vectors, each as long as x. Element j is x shifted
# forward by n - j positions and padded with NA at the end, so the last
# element of the list is x itself.
# seq_along(x) keeps the result well-formed for empty x (the old
# i:(i + length(x) - 1) yielded two NAs), and seq_len(n) avoids 0:1 at n = 0.
f <- function(x, n = 3) {
  lapply(rev(seq_len(n)), \(k) x[seq_along(x) + (k - 1L)])
}
  3. 按感兴趣的分组变量应用该函数,记得先排序。例如,若只按玩家 ID 分组来获取之前的得分,可以这样使用 f():
# Per player, read rows in descending match_id order and fill
# last3/last2/last1 from f(); the trailing [] prints the updated table.
data[
  order(-match_id),
  c("last3", "last2", "last1") := f(points, n = 3),
  by = "player_id"
][]
  4. 如果您还想按场地和对手分组,请执行以下操作:
# Same as above, but grouped by player, venue and opponent together.
data[
  order(-match_id),
  c("last3", "last2", "last1") := f(points, n = 3),
  by = c("player_id", "venue", "opponent")
][]

输出(player_id):

    match_id player_id   venue   opponent points last3 last2 last1
      <char>    <char>  <char>     <char>  <num> <num> <num> <num>
 1:  match_1  player_1 venue A opponent A      5    NA    NA     5
 2:  match_1  player_2 venue A opponent A     10    NA    NA    10
 3:  match_1  player_3 venue A opponent A     15    NA    NA    15
 4:  match_2  player_1 venue B opponent B      1    NA     5     1
 5:  match_2  player_2 venue B opponent B      2    NA    10     2
 6:  match_2  player_3 venue B opponent B      3    NA    15     3
 7:  match_3  player_1 venue C opponent C      5     5     1     5
 8:  match_3  player_2 venue C opponent C      7    10     2     7
 9:  match_3  player_3 venue C opponent C      9    15     3     9
10:  match_4  player_1 venue C opponent C     11     1     5    11
11:  match_4  player_2 venue C opponent C      2     2     7     2
12:  match_4  player_3 venue C opponent C      6     3     9     6

如果你想要组合列,可以这样做(假设你已将上面的结果赋值给 r1):

# paste() is vectorised over rows, so the comma-separated string can be built
# in one pass; grouping by 1:nrow(r1) only added one group per row.
# The trailing [] prints the result.
r1[, combined := paste(last3, last2, last1, sep = ",")][]

输出:

    match_id player_id   venue   opponent points last3 last2 last1 combined
      <char>    <char>  <char>     <char>  <num> <num> <num> <num>   <char>
 1:  match_1  player_1 venue A opponent A      5    NA    NA     5  NA,NA,5
 2:  match_1  player_2 venue A opponent A     10    NA    NA    10 NA,NA,10
 3:  match_1  player_3 venue A opponent A     15    NA    NA    15 NA,NA,15
 4:  match_2  player_1 venue B opponent B      1    NA     5     1   NA,5,1
 5:  match_2  player_2 venue B opponent B      2    NA    10     2  NA,10,2
 6:  match_2  player_3 venue B opponent B      3    NA    15     3  NA,15,3
 7:  match_3  player_1 venue C opponent C      5     5     1     5    5,1,5
 8:  match_3  player_2 venue C opponent C      7    10     2     7   10,2,7
 9:  match_3  player_3 venue C opponent C      9    15     3     9   15,3,9
10:  match_4  player_1 venue C opponent C     11     1     5    11   1,5,11
11:  match_4  player_2 venue C opponent C      2     2     7     2    2,7,2
12:  match_4  player_3 venue C opponent C      6     3     9     6    3,9,6

这是所需的最少代码集:

library(data.table)

setDT(data)

# Build the "last n" columns for one group's values.
#   x: vector of values for one group, in the order the caller sorted them.
#   n: number of columns to build (default 3).
# Returns a list of n vectors, each as long as x. Element j is x shifted
# forward by n - j positions and padded with NA at the end, so the last
# element of the list is x itself.
# seq_along(x) keeps the result well-formed for empty x (the old
# i:(i + length(x) - 1) yielded two NAs), and seq_len(n) avoids 0:1 at n = 0.
f <- function(x, n = 3) {
  lapply(rev(seq_len(n)), \(k) x[seq_along(x) + (k - 1L)])
}

# Per player, read rows in descending match_id order and fill
# last3/last2/last1 from f().
data[
  order(-match_id),
  c("last3", "last2", "last1") := f(points, 3),
  by = player_id
]
# paste() is vectorised over rows, so no per-row grouping (by = 1:nrow(data))
# is needed to build the comma-separated string.
data[, combined := paste(last3, last2, last1, sep = ",")]

更新-

OP 现在想要在特定条件下排除某些行(跳过那些行)。如果可以将掩码传递给 f(),表示要包含哪些行,那么,我们可以这样调整 f()

# Build the "last n" columns for one group's values, blanking masked-out rows.
#   x: vector of values for one group, in the order the caller sorted them.
#   n: number of columns to build (default 3).
#   m: logical mask the same length as x; values where m is FALSE are set to
#      NA before shifting (default: keep everything).
# Returns a list of n vectors, each as long as x. Element j is the masked x
# shifted forward by n - j positions and padded with NA at the end.
# seq_along(x) keeps the result well-formed for empty x (the old
# i:(i + length(x) - 1) yielded two NAs), and seq_len(n) avoids 0:1 at n = 0.
f <- function(x, n = 3, m = rep(TRUE, length(x))) {
  x[!m] <- NA
  lapply(rev(seq_len(n)), function(k) x[seq_along(x) + (k - 1L)])
}

此示例使用上述调整后的 f() 来跳过 game_x==0 的行:
# Per player, read rows in descending match_id order and fill
# last3/last2/last1 from f(), with rows where game_x != 1 masked to NA;
# then print the table sorted by player and descending match_id.
data[
  order(-match_id),
  c("last3", "last2", "last1") := f(points, n = 3, m = game_x == 1),
  by = "player_id"
][order(player_id, -match_id)][]

另一个更新:

现在 OP 想要完全排除 game_x=0 行。

# Compute last5..last1 only on the game_x == 1 rows (per player, descending
# match_id), then stack the untouched game_x == 0 rows on top; fill = TRUE
# pads the columns those rows lack with NA.
rbind(
  data[game_x == 0],
  data[game_x == 1][
    order(-match_id),
    c("last5", "last4", "last3", "last2", "last1") := f(points, n = 5),
    by = "player_id"
  ][order(player_id, -match_id)],
  fill = TRUE
)

输出:

   match_id player_id   venue   opponent game_x points last5 last4 last3 last2 last1
      <num>    <char>  <char>     <char>  <num>  <num> <num> <num> <num> <num> <num>
1:        3  player_1 venue B opponent A      0     15    NA    NA    NA    NA    NA
2:        5  player_1 venue B opponent C      0      2    NA    NA    NA    NA    NA
3:        4  player_1 venue B opponent C      1      1    NA    NA     5    10     1
4:        2  player_1 venue A opponent B      1     10    NA    NA    NA     5    10
5:        1  player_1 venue A opponent A      1      5    NA    NA    NA    NA     5