在 tibble 列中找到最长的连续连胜
Finding the longest consecutive streak in a tibble column
我有以下问题:
# Input data: one row per (year, event, winner). An event can have several
# winners in the same year, and there are no rows at all for 2012 or 2015.
my_tbl <- tribble(
~year, ~event_id, ~winner_id,
2011, "A", 4322,
2011, "A", 9604,
2011, "A", 1180,
2013, "A", 4322,
2013, "A", 9604,
2013, "A", 1663,
2014, "A", 4322,
2016, "A", 5478,
2017, "A", 4322,
2017, "A", 1663,
2011, "B", 4322,
2013, "B", 7893,
2013, "B", 1188,
2014, "B", 7893,
2016, "B", 2365,
2017, "B", 3407,
2011, "C", 5556,
2013, "C", 5556,
2014, "C", 1238,
2016, "C", 2391,
2017, "C", 2391,
2011, "D", 4219,
2013, "D", 7623,
2014, "D", 8003,
2016, "D", 2851,
# NOTE: 0418 is a numeric literal, so R stores it as 418 (leading zero is lost).
2017, "D", 0418
)
我想按事件 ID 找出最多的连续获胜次数。我正在寻找的结果如下所示:
# Desired summary: one row per event giving the longest run of consecutive
# wins (counted over the years in which the event was actually held), how many
# winners share that run length, who they are, and the years of each streak.
results_summary_tbl <- tribble(
  ~event_id, ~most_wins_in_a_row, ~number_of_winners, ~winners, ~years,
  "A", 3, 1, "4322", "4322 = (2011, 2013, 2014)",
  # In my_tbl winner 2391 takes event C in 2016 and 2017.
  "C", 2, 2, "5556 , 2391", "5556 = (2011, 2013), 2391 = (2016, 2017)",
  "B", 2, 1, "7893", "7893 = (2013, 2014)",
  "D", 1, 5, "4219 , 7623 , 8003 , 2851 , 0418", "4219 = (2011), 7623 = (2013), 8003 = (2014), 2851 = (2016), 0418 = (2017)"
)
请注意缺少年份,因为这些年份没有发生事件。
我写了以下代码,但由于存在缺失的年份,它无法正常工作:
# Attempt based on calendar-year gaps: within each (event, winner) pair a new
# run starts whenever the gap to the previous win is larger than one year.
# Years in which no event took place (e.g. 2012) therefore also end a run.
my_tbl %>%
  arrange(event_id, winner_id, year) %>%
  group_by(event_id, winner_id) %>%
  mutate(
    gap = year - lag(year, default = first(year)),
    run = cumsum(gap > 1)
  ) %>%
  count(event_id, winner_id, run) %>%
  group_by(event_id) %>%
  summarise(
    most_wins_in_a_row = max(n),
    number_of_winners = sum(n == most_wins_in_a_row),
    winners = paste(winner_id[n == most_wins_in_a_row], collapse = ",")
  )
我遇到了缺失年份的问题,所以我不得不使用带有 rle 的基本 R 方法来解决它。例如,我们取一个子集,其中 event_id=="A":
# Keep only the rows that belong to event "A".
z <- my_tbl[my_tbl$event_id == "A", ]
要知道这个活动在哪些年份举办以及每年谁获胜,可以做一个列联表:
# Cross-tabulate years (rows) against winners (columns); a 1 marks a win.
table(z$year, z$winner_id)
1180 1663 4322 5478 9604
2011 1 0 1 0 1
2013 0 1 1 0 1
2014 0 0 1 0 0
2016 0 0 0 1 0
2017 0 1 1 0 0
这样,问题就简化为:为每一列找出最长的连续 1 的长度。为此,我使用 rle:
# Per winner (column): length of the longest run of consecutive 1s.
apply(table(z$year, z$winner_id), 2, function(wins) {
  runs <- rle(wins)
  max(runs$lengths[runs$values == 1])
})
1180 1663 4322 5478 9604
1 1 3 1 2
结果显示,在 event_id == "A" 的子集中,4322 的连胜最长。有了这个,很容易把你需要的输出写成一个 data.frame。剩下的就是将这个函数应用于所有数据子集:
library(purrr)
library(tidyr)
# Summarise the longest winning streak(s) within a single event.
#
# A streak is counted over the years in which the event was actually held, so
# a calendar gap such as 2011 -> 2013 does not interrupt it.
#
# Args:
#   z: data frame (or tibble) holding the rows of ONE event, with the columns
#      `year`, `event_id` and `winner_id`.
#
# Returns:
#   A one-row data.frame with the columns `event_id`, `most_wins_in_a_row`,
#   `number_of_winners`, `winners` (comma-separated ids) and `years` (one
#   comma-separated list of years per winner, winners separated by ";").
#   For a zero-row `z` a zero-row data.frame with the same columns is returned.
findLongestStreak <- function(z) {
  if (nrow(z) == 0) {
    return(data.frame(
      event_id = z$event_id,
      most_wins_in_a_row = integer(0),
      number_of_winners = integer(0),
      winners = character(0),
      years = character(0),
      stringsAsFactors = FALSE
    ))
  }

  # Rows = years the event was held, columns = winners.
  tab <- table(z$year, z$winner_id)
  # Unused factor levels would add all-zero rows/columns; drop them so they
  # neither break a streak nor produce a winner without any win.
  tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
  held_years <- rownames(tab)

  per_winner <- lapply(seq_len(ncol(tab)), function(j) {
    # Presence/absence: a duplicated (year, winner) row gives a count of 2,
    # which must still be treated as a single win in that year.
    won <- as.vector(tab[, j]) > 0
    runs <- rle(won)
    best <- max(runs$lengths[runs$values])
    # Positions belonging to a winning run of maximal length; keep the first
    # such run only.
    in_best_run <- rep(runs$values & runs$lengths == best, runs$lengths)
    first_best <- which(in_best_run)[seq_len(best)]
    data.frame(
      N = best,
      YEARS = paste(held_years[first_best], collapse = ","),
      stringsAsFactors = FALSE
    )
  })
  per_winner <- do.call(rbind, per_winner)
  rownames(per_winner) <- colnames(tab)

  max_streak <- max(per_winner$N)
  is_top <- per_winner$N == max_streak

  data.frame(
    event_id = z$event_id[1],
    most_wins_in_a_row = max_streak,
    number_of_winners = sum(is_top),
    winners = paste(rownames(per_winner)[is_top], collapse = ","),
    years = paste(per_winner$YEARS[is_top], collapse = ";"),
    stringsAsFactors = FALSE
  )
}
# Run findLongestStreak on each event's rows and row-bind the per-event results.
my_tbl %>%
  split(.$event_id) %>%
  map_dfr(findLongestStreak)
event_id most_wins_in_a_row number_of_winners winners
1 A 3 1 4322
2 B 2 1 7893
3 C 2 2 2391,5556
4 D 1 5 418,2851,4219,7623,8003
years
1 2011,2013,2014
2 2013,2014
3 2016,2017;2011,2013
4 2017;2016;2011;2013;2014
我有以下问题:
# Input data: one row per (year, event, winner). An event can have several
# winners in the same year, and there are no rows at all for 2012 or 2015.
my_tbl <- tribble(
~year, ~event_id, ~winner_id,
2011, "A", 4322,
2011, "A", 9604,
2011, "A", 1180,
2013, "A", 4322,
2013, "A", 9604,
2013, "A", 1663,
2014, "A", 4322,
2016, "A", 5478,
2017, "A", 4322,
2017, "A", 1663,
2011, "B", 4322,
2013, "B", 7893,
2013, "B", 1188,
2014, "B", 7893,
2016, "B", 2365,
2017, "B", 3407,
2011, "C", 5556,
2013, "C", 5556,
2014, "C", 1238,
2016, "C", 2391,
2017, "C", 2391,
2011, "D", 4219,
2013, "D", 7623,
2014, "D", 8003,
2016, "D", 2851,
# NOTE: 0418 is a numeric literal, so R stores it as 418 (leading zero is lost).
2017, "D", 0418
)
我想按事件 ID 找出最多的连续获胜次数。我正在寻找的结果如下所示:
# Desired summary: one row per event giving the longest run of consecutive
# wins (counted over the years in which the event was actually held), how many
# winners share that run length, who they are, and the years of each streak.
results_summary_tbl <- tribble(
  ~event_id, ~most_wins_in_a_row, ~number_of_winners, ~winners, ~years,
  "A", 3, 1, "4322", "4322 = (2011, 2013, 2014)",
  # In my_tbl winner 2391 takes event C in 2016 and 2017.
  "C", 2, 2, "5556 , 2391", "5556 = (2011, 2013), 2391 = (2016, 2017)",
  "B", 2, 1, "7893", "7893 = (2013, 2014)",
  "D", 1, 5, "4219 , 7623 , 8003 , 2851 , 0418", "4219 = (2011), 7623 = (2013), 8003 = (2014), 2851 = (2016), 0418 = (2017)"
)
请注意缺少年份,因为这些年份没有发生事件。
我写了以下代码,但由于存在缺失的年份,它无法正常工作:
# Attempt based on calendar-year gaps: within each (event, winner) pair a new
# run starts whenever the gap to the previous win is larger than one year.
# Years in which no event took place (e.g. 2012) therefore also end a run.
my_tbl %>%
  arrange(event_id, winner_id, year) %>%
  group_by(event_id, winner_id) %>%
  mutate(
    gap = year - lag(year, default = first(year)),
    run = cumsum(gap > 1)
  ) %>%
  count(event_id, winner_id, run) %>%
  group_by(event_id) %>%
  summarise(
    most_wins_in_a_row = max(n),
    number_of_winners = sum(n == most_wins_in_a_row),
    winners = paste(winner_id[n == most_wins_in_a_row], collapse = ",")
  )
我遇到了缺失年份的问题,所以我不得不使用带有 rle 的基本 R 方法来解决它。例如,我们取一个子集,其中 event_id=="A":
# Keep only the rows that belong to event "A".
z <- my_tbl[my_tbl$event_id == "A", ]
要知道这个活动在哪些年份举办以及每年谁获胜,可以做一个列联表:
# Cross-tabulate years (rows) against winners (columns); a 1 marks a win.
table(z$year, z$winner_id)
1180 1663 4322 5478 9604
2011 1 0 1 0 1
2013 0 1 1 0 1
2014 0 0 1 0 0
2016 0 0 0 1 0
2017 0 1 1 0 0
这样,问题就简化为:为每一列找出最长的连续 1 的长度。为此,我使用 rle:
# Per winner (column): length of the longest run of consecutive 1s.
apply(table(z$year, z$winner_id), 2, function(wins) {
  runs <- rle(wins)
  max(runs$lengths[runs$values == 1])
})
1180 1663 4322 5478 9604
1 1 3 1 2
结果显示,在 event_id == "A" 的子集中,4322 的连胜最长。有了这个,很容易把你需要的输出写成一个 data.frame。剩下的就是将这个函数应用于所有数据子集:
library(purrr)
library(tidyr)
# Summarise the longest winning streak(s) within a single event.
#
# A streak is counted over the years in which the event was actually held, so
# a calendar gap such as 2011 -> 2013 does not interrupt it.
#
# Args:
#   z: data frame (or tibble) holding the rows of ONE event, with the columns
#      `year`, `event_id` and `winner_id`.
#
# Returns:
#   A one-row data.frame with the columns `event_id`, `most_wins_in_a_row`,
#   `number_of_winners`, `winners` (comma-separated ids) and `years` (one
#   comma-separated list of years per winner, winners separated by ";").
#   For a zero-row `z` a zero-row data.frame with the same columns is returned.
findLongestStreak <- function(z) {
  if (nrow(z) == 0) {
    return(data.frame(
      event_id = z$event_id,
      most_wins_in_a_row = integer(0),
      number_of_winners = integer(0),
      winners = character(0),
      years = character(0),
      stringsAsFactors = FALSE
    ))
  }

  # Rows = years the event was held, columns = winners.
  tab <- table(z$year, z$winner_id)
  # Unused factor levels would add all-zero rows/columns; drop them so they
  # neither break a streak nor produce a winner without any win.
  tab <- tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
  held_years <- rownames(tab)

  per_winner <- lapply(seq_len(ncol(tab)), function(j) {
    # Presence/absence: a duplicated (year, winner) row gives a count of 2,
    # which must still be treated as a single win in that year.
    won <- as.vector(tab[, j]) > 0
    runs <- rle(won)
    best <- max(runs$lengths[runs$values])
    # Positions belonging to a winning run of maximal length; keep the first
    # such run only.
    in_best_run <- rep(runs$values & runs$lengths == best, runs$lengths)
    first_best <- which(in_best_run)[seq_len(best)]
    data.frame(
      N = best,
      YEARS = paste(held_years[first_best], collapse = ","),
      stringsAsFactors = FALSE
    )
  })
  per_winner <- do.call(rbind, per_winner)
  rownames(per_winner) <- colnames(tab)

  max_streak <- max(per_winner$N)
  is_top <- per_winner$N == max_streak

  data.frame(
    event_id = z$event_id[1],
    most_wins_in_a_row = max_streak,
    number_of_winners = sum(is_top),
    winners = paste(rownames(per_winner)[is_top], collapse = ","),
    years = paste(per_winner$YEARS[is_top], collapse = ";"),
    stringsAsFactors = FALSE
  )
}
# Run findLongestStreak on each event's rows and row-bind the per-event results.
my_tbl %>%
  split(.$event_id) %>%
  map_dfr(findLongestStreak)
event_id most_wins_in_a_row number_of_winners winners
1 A 3 1 4322
2 B 2 1 7893
3 C 2 2 2391,5556
4 D 1 5 418,2851,4219,7623,8003
years
1 2011,2013,2014
2 2013,2014
3 2016,2017;2011,2013
4 2017;2016;2011;2013;2014