如何在 tibble 列中找到最长的重复序列?
How to find the longest duplicate sequence in a tibble column?
我更新了我的问题,因为我的输出表还需要一列。
我有以下问题:
library(tibble)
# Example data: one row per (year, event) pair with that year's winner id.
my_tbl <- tribble(
  ~year, ~event_id, ~winner_id,
  # Event A
  2011, "A", 4322,
  2012, "A", 4322,
  2013, "A", 4322,
  2014, "A", 5478,
  2015, "A", 4322,
  # Event B
  2011, "B", 4322,
  2012, "B", 7893,
  2013, "B", 7893,
  2014, "B", 2365,
  2015, "B", 3407,
  # Event C
  2011, "C", 5556,
  2012, "C", 5556,
  2013, "C", 1238,
  2014, "C", 2391,
  2015, "C", 2391,
  # Event D
  2011, "D", 4219,
  2012, "D", 7623,
  2013, "D", 8003,
  2014, "D", 2851,
  # NOTE: the numeric literal 0418 is read as 418, so the leading zero is
  # not stored in the column.
  2015, "D", 0418
)
我想按事件 ID 找出最多的连续获胜次数。我正在寻找的结果如下所示:
# Hand-written target output: one row per event with the longest winning
# streak, the number of streaks of that length, the winner ids involved and
# the years each of those streaks covers.
results_summary_tbl <- tribble(
  ~event_id, ~most_wins_in_a_row, ~number_of_winners, ~winners, ~years,
  "A", 3, 1,
  "4322",
  "4322 = (2011, 2012, 2013)",
  "C", 2, 2,
  "5556 , 2391",
  "5556 = (2011, 2012), 2391 = (2014, 2015)",
  "B", 2, 1,
  "7893",
  "7893 = (2012, 2013)",
  "D", 1, 5,
  "4219 , 7623 , 8003 , 2851 , 0418",
  "4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 0418 = ( 2015)"
)
谢谢
按 'event_id' 分组后,获取最长相邻相同元素序列的一个选项是 `rle`。它返回一个由 `lengths` 和对应的 `values` 组成的 list。利用 'lengths' 的 `max` 构造一个逻辑表达式,即可同时对 'values' 和 'lengths' 取子集:
library(dplyr)
library(purrr)
# Per event: run-length encode the winner ids (in row order), then report the
# longest run, how many runs reach that length, and the ids of those runs.
my_tbl %>%
  group_by(event_id) %>%
  # Keep one rle object per event in a list column.
  summarise(runs = list(rle(winner_id))) %>%
  mutate(
    most_wins_in_a_row = map_int(runs, function(run) max(run$lengths)),
    number_of_winners = map2_int(
      runs, most_wins_in_a_row,
      function(run, longest) sum(run$lengths == longest)
    ),
    winners = map2_chr(
      runs, most_wins_in_a_row,
      function(run, longest) {
        paste(run$values[run$lengths == longest], collapse = ", ")
      }
    )
  ) %>%
  # The helper list column is not part of the result.
  select(-runs)
# A tibble: 4 x 4
# event_id most_wins_in_a_row number_of_winners winners
# <chr> <int> <int> <chr>
#1 A 3 1 4322
#2 B 2 1 7893
#3 C 2 2 5556, 2391
#4 D 1 5 4219, 7623, 8003, 2851, 418
另一个只使用 `dplyr` 的方案可以是:
# Give every row a run id that increases each time the winner differs from
# the previous row, count rows per (event, run), then summarise per event.
my_tbl %>%
  mutate(
    rleid = cumsum(winner_id != lag(winner_id, default = first(winner_id)))
  ) %>%
  # Adds column `n`: the length of the run each row belongs to.
  add_count(event_id, rleid) %>%
  group_by(event_id) %>%
  summarise(
    most_wins_in_a_row = max(n),
    number_of_winners = n_distinct(winner_id[n == most_wins_in_a_row]),
    winners = paste(unique(winner_id[n == most_wins_in_a_row]), collapse = ",")
  )
event_id most_wins_in_a_row number_of_winners winners
<chr> <int> <int> <chr>
1 A 3 1 4322
2 B 2 1 7893
3 C 2 2 5556,2391
4 D 1 5 4219,7623,8003,2851,418
我更新了我的问题,因为我的输出表还需要一列。
我有以下问题:
library(tibble)
# Example data: one row per (year, event) pair with that year's winner id.
my_tbl <- tribble(
  ~year, ~event_id, ~winner_id,
  # Event A
  2011, "A", 4322,
  2012, "A", 4322,
  2013, "A", 4322,
  2014, "A", 5478,
  2015, "A", 4322,
  # Event B
  2011, "B", 4322,
  2012, "B", 7893,
  2013, "B", 7893,
  2014, "B", 2365,
  2015, "B", 3407,
  # Event C
  2011, "C", 5556,
  2012, "C", 5556,
  2013, "C", 1238,
  2014, "C", 2391,
  2015, "C", 2391,
  # Event D
  2011, "D", 4219,
  2012, "D", 7623,
  2013, "D", 8003,
  2014, "D", 2851,
  # NOTE: the numeric literal 0418 is read as 418, so the leading zero is
  # not stored in the column.
  2015, "D", 0418
)
我想按事件 ID 找出最多的连续获胜次数。我正在寻找的结果如下所示:
# Hand-written target output: one row per event with the longest winning
# streak, the number of streaks of that length, the winner ids involved and
# the years each of those streaks covers.
results_summary_tbl <- tribble(
  ~event_id, ~most_wins_in_a_row, ~number_of_winners, ~winners, ~years,
  "A", 3, 1,
  "4322",
  "4322 = (2011, 2012, 2013)",
  "C", 2, 2,
  "5556 , 2391",
  "5556 = (2011, 2012), 2391 = (2014, 2015)",
  "B", 2, 1,
  "7893",
  "7893 = (2012, 2013)",
  "D", 1, 5,
  "4219 , 7623 , 8003 , 2851 , 0418",
  "4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 0418 = ( 2015)"
)
谢谢
按 'event_id' 分组后,获取最长相邻相同元素序列的一个选项是 `rle`。它返回一个由 `lengths` 和对应的 `values` 组成的 list。利用 'lengths' 的 `max` 构造一个逻辑表达式,即可同时对 'values' 和 'lengths' 取子集:
library(dplyr)
library(purrr)
# Per event: run-length encode the winner ids (in row order), then report the
# longest run, how many runs reach that length, and the ids of those runs.
my_tbl %>%
  group_by(event_id) %>%
  # Keep one rle object per event in a list column.
  summarise(runs = list(rle(winner_id))) %>%
  mutate(
    most_wins_in_a_row = map_int(runs, function(run) max(run$lengths)),
    number_of_winners = map2_int(
      runs, most_wins_in_a_row,
      function(run, longest) sum(run$lengths == longest)
    ),
    winners = map2_chr(
      runs, most_wins_in_a_row,
      function(run, longest) {
        paste(run$values[run$lengths == longest], collapse = ", ")
      }
    )
  ) %>%
  # The helper list column is not part of the result.
  select(-runs)
# A tibble: 4 x 4
# event_id most_wins_in_a_row number_of_winners winners
# <chr> <int> <int> <chr>
#1 A 3 1 4322
#2 B 2 1 7893
#3 C 2 2 5556, 2391
#4 D 1 5 4219, 7623, 8003, 2851, 418
另一个只使用 `dplyr` 的方案可以是:
# Give every row a run id that increases each time the winner differs from
# the previous row, count rows per (event, run), then summarise per event.
my_tbl %>%
  mutate(
    rleid = cumsum(winner_id != lag(winner_id, default = first(winner_id)))
  ) %>%
  # Adds column `n`: the length of the run each row belongs to.
  add_count(event_id, rleid) %>%
  group_by(event_id) %>%
  summarise(
    most_wins_in_a_row = max(n),
    number_of_winners = n_distinct(winner_id[n == most_wins_in_a_row]),
    winners = paste(unique(winner_id[n == most_wins_in_a_row]), collapse = ",")
  )
event_id most_wins_in_a_row number_of_winners winners
<chr> <int> <int> <chr>
1 A 3 1 4322
2 B 2 1 7893
3 C 2 2 5556,2391
4 D 1 5 4219,7623,8003,2851,418