如何在 tibble 列中找到最长的重复序列(跟进问题)?
How to find the longest duplicate sequence in a tibble column (follow up question)?
这是我之前发布的一个问题的后续问题:
除了原有的输出之外,我还需要额外的一列(years,即每位获胜者连胜所对应的年份):
library(tibble)
library(purrr)
# Example data: one row per (year, event_id) pair holding the winner of that
# event in that year. Rows are ordered by event_id, then by year.
my_tbl <- tibble(
  year = rep(c(2011, 2012, 2013, 2014, 2015), times = 4),
  event_id = rep(c("A", "B", "C", "D"), each = 5),
  winner_id = c(
    4322, 4322, 4322, 5478, 4322, # event A
    4322, 7893, 7893, 2365, 3407, # event B
    5556, 5556, 1238, 2391, 2391, # event C
    # 0418 is a numeric literal, so it is stored as 418 (no leading zero).
    4219, 7623, 8003, 2851, 0418 # event D
  )
)
# Desired result: one row per event with the length of the longest winning
# streak, how many winners reached that length, who they are, and the years
# covered by each of those streaks.
results_summary_tbl <- tibble(
  event_id = c("A", "C", "B", "D"),
  most_wins_in_a_row = c(3, 2, 2, 1),
  number_of_winners = c(1, 2, 1, 5),
  winners = c(
    "4322",
    "5556 , 2391",
    "7893",
    "4219 , 7623 , 8003 , 2851 , 0418"
  ),
  years = c(
    "4322 = (2011, 2012, 2013)",
    "5556 = (2011, 2012), 2391 = (2014, 2015)",
    "7893 = (2012, 2013)",
    "4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 0418 = (2015)"
  )
)
谢谢,
沿用 @tmfmnk 在上一个帖子中的方法,我们可以通过以下代码
实现预期的输出:
library(dplyr)
# Summarise my_tbl to one row per event_id, describing the longest run of
# consecutive wins within each event.
#
# Columns added to the result:
#   n                  - length of the run the kept row belongs to
#   most_wins_in_a_row - longest run length within the event
#   number_of_winners  - number of distinct winners with a run of that length
#   winners            - those winners, comma-separated
#   year               - for every winner of the event, "<winner_id>= (<years>)"
#                        listing all years that winner won the event (not only
#                        the years of the longest run)
#
# The result is explicitly ungrouped so that later verbs do not silently
# operate per event_id.
my_tbl %>%
  # Give each run of identical consecutive winner_id values its own id
  # (computed over the whole table), then count the rows per event_id and
  # run id; that count `n` is the run length.
  add_count(
    event_id,
    rleid = cumsum(winner_id != lag(winner_id, default = first(winner_id)))
  ) %>%
  group_by(event_id) %>%
  mutate(
    most_wins_in_a_row = max(n),
    number_of_winners = n_distinct(winner_id[n == max(n)]),
    winners = paste0(unique(winner_id[n == max(n)]), collapse = ",")
  ) %>%
  # Build one "<winner_id>= (<years>)" label per winner within each event.
  group_by(event_id, winner_id) %>%
  mutate(year = paste0(first(winner_id), "= (", toString(year), ")")) %>%
  # Combine the labels of all winners of the event into a single string.
  group_by(event_id) %>%
  mutate(year = toString(unique(year))) %>%
  # Keep a single row per event and drop the grouping.
  slice(1L) %>%
  ungroup()
如果您在上述管道链的末尾
添加 pull(year),您将看到如下输出:
#[1] "4322= (2011, 2012, 2013, 2015), 5478= (2014)"
#[2] "4322= (2011), 7893= (2012, 2013), 2365= (2014), 3407= (2015)"
#[3] "5556= (2011, 2012), 1238= (2013), 2391= (2014, 2015)"
#[4] "4219= (2011), 7623= (2012), 8003= (2013), 2851= (2014), 418= (2015)"
使用 data.table,您可以找出每个 event_id 中 rowid(rleid(winner_id)) 值最大的行
(该值即连续重复的最大次数),并据此创建一个中间表。在此基础上进行后续计算会更容易。
library(data.table)
# Convert my_tbl to a data.table in place.
setDT(my_tbl)

# ro: position of each row inside its run of consecutive identical
# (winner_id, event_id) values, so the last row of a run holds the run length.
my_tbl[, ro := rowid(rleid(winner_id, event_id))]

# Keep, per event, the rows whose ro equals the event's maximum ro, i.e. the
# final row of every longest run.
most_wins <- my_tbl[
  my_tbl[, .(ro = max(ro)), by = event_id],
  on = .(ro, event_id)
]

# Replace the final year of each run by the comma-separated years of the whole
# run, counting back run-length steps from the final year.
# NOTE(review): assumes the years inside a run are consecutive (step of 1).
most_wins[, year := Map(
  function(last_year, run_len) {
    toString(last_year - rev(seq_len(run_len)) + 1)
  },
  year,
  ro
)]

# One summary row per event.
most_wins[
  ,
  .(
    most_wins_in_a_row = ro[1L],
    number_of_winners = .N,
    winners = toString(winner_id),
    years = toString(paste0(winner_id, " = (", year, ")"))
  ),
  by = event_id
]
# event_id most_wins_in_a_row number_of_winners winners
# 1: A 3 1 4322
# 2: B 2 1 7893
# 3: C 2 2 5556, 2391
# 4: D 1 5 4219, 7623, 8003, 2851, 418
# years
# 1: 4322 = (2011, 2012, 2013)
# 2: 7893 = (2012, 2013)
# 3: 5556 = (2011, 2012), 2391 = (2014, 2015)
# 4: 4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 418 = (2015)
这是我之前发布的一个问题的后续问题:
除了原有的输出之外,我还需要额外的一列(years,即每位获胜者连胜所对应的年份):
library(tibble)
library(purrr)
# Example data: one row per (year, event_id) pair holding the winner of that
# event in that year. Rows are ordered by event_id, then by year.
my_tbl <- tibble(
  year = rep(c(2011, 2012, 2013, 2014, 2015), times = 4),
  event_id = rep(c("A", "B", "C", "D"), each = 5),
  winner_id = c(
    4322, 4322, 4322, 5478, 4322, # event A
    4322, 7893, 7893, 2365, 3407, # event B
    5556, 5556, 1238, 2391, 2391, # event C
    # 0418 is a numeric literal, so it is stored as 418 (no leading zero).
    4219, 7623, 8003, 2851, 0418 # event D
  )
)
# Desired result: one row per event with the length of the longest winning
# streak, how many winners reached that length, who they are, and the years
# covered by each of those streaks.
results_summary_tbl <- tibble(
  event_id = c("A", "C", "B", "D"),
  most_wins_in_a_row = c(3, 2, 2, 1),
  number_of_winners = c(1, 2, 1, 5),
  winners = c(
    "4322",
    "5556 , 2391",
    "7893",
    "4219 , 7623 , 8003 , 2851 , 0418"
  ),
  years = c(
    "4322 = (2011, 2012, 2013)",
    "5556 = (2011, 2012), 2391 = (2014, 2015)",
    "7893 = (2012, 2013)",
    "4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 0418 = (2015)"
  )
)
谢谢,
沿用 @tmfmnk 在上一个帖子中的方法,我们可以通过以下代码
实现预期的输出:
library(dplyr)
# Summarise my_tbl to one row per event_id, describing the longest run of
# consecutive wins within each event.
#
# Columns added to the result:
#   n                  - length of the run the kept row belongs to
#   most_wins_in_a_row - longest run length within the event
#   number_of_winners  - number of distinct winners with a run of that length
#   winners            - those winners, comma-separated
#   year               - for every winner of the event, "<winner_id>= (<years>)"
#                        listing all years that winner won the event (not only
#                        the years of the longest run)
#
# The result is explicitly ungrouped so that later verbs do not silently
# operate per event_id.
my_tbl %>%
  # Give each run of identical consecutive winner_id values its own id
  # (computed over the whole table), then count the rows per event_id and
  # run id; that count `n` is the run length.
  add_count(
    event_id,
    rleid = cumsum(winner_id != lag(winner_id, default = first(winner_id)))
  ) %>%
  group_by(event_id) %>%
  mutate(
    most_wins_in_a_row = max(n),
    number_of_winners = n_distinct(winner_id[n == max(n)]),
    winners = paste0(unique(winner_id[n == max(n)]), collapse = ",")
  ) %>%
  # Build one "<winner_id>= (<years>)" label per winner within each event.
  group_by(event_id, winner_id) %>%
  mutate(year = paste0(first(winner_id), "= (", toString(year), ")")) %>%
  # Combine the labels of all winners of the event into a single string.
  group_by(event_id) %>%
  mutate(year = toString(unique(year))) %>%
  # Keep a single row per event and drop the grouping.
  slice(1L) %>%
  ungroup()
如果您在上述管道链的末尾
添加 pull(year),您将看到如下输出:
#[1] "4322= (2011, 2012, 2013, 2015), 5478= (2014)"
#[2] "4322= (2011), 7893= (2012, 2013), 2365= (2014), 3407= (2015)"
#[3] "5556= (2011, 2012), 1238= (2013), 2391= (2014, 2015)"
#[4] "4219= (2011), 7623= (2012), 8003= (2013), 2851= (2014), 418= (2015)"
使用 data.table,您可以找出每个 event_id 中 rowid(rleid(winner_id)) 值最大的行
(该值即连续重复的最大次数),并据此创建一个中间表。在此基础上进行后续计算会更容易。
library(data.table)
# Convert my_tbl to a data.table in place.
setDT(my_tbl)

# ro: position of each row inside its run of consecutive identical
# (winner_id, event_id) values, so the last row of a run holds the run length.
my_tbl[, ro := rowid(rleid(winner_id, event_id))]

# Keep, per event, the rows whose ro equals the event's maximum ro, i.e. the
# final row of every longest run.
most_wins <- my_tbl[
  my_tbl[, .(ro = max(ro)), by = event_id],
  on = .(ro, event_id)
]

# Replace the final year of each run by the comma-separated years of the whole
# run, counting back run-length steps from the final year.
# NOTE(review): assumes the years inside a run are consecutive (step of 1).
most_wins[, year := Map(
  function(last_year, run_len) {
    toString(last_year - rev(seq_len(run_len)) + 1)
  },
  year,
  ro
)]

# One summary row per event.
most_wins[
  ,
  .(
    most_wins_in_a_row = ro[1L],
    number_of_winners = .N,
    winners = toString(winner_id),
    years = toString(paste0(winner_id, " = (", year, ")"))
  ),
  by = event_id
]
# event_id most_wins_in_a_row number_of_winners winners
# 1: A 3 1 4322
# 2: B 2 1 7893
# 3: C 2 2 5556, 2391
# 4: D 1 5 4219, 7623, 8003, 2851, 418
# years
# 1: 4322 = (2011, 2012, 2013)
# 2: 7893 = (2012, 2013)
# 3: 5556 = (2011, 2012), 2391 = (2014, 2015)
# 4: 4219 = (2011), 7623 = (2012), 8003 = (2013), 2851 = (2014), 418 = (2015)