使用 sparklyr 或 dplyr 获取组组合的成员计数
Getting counts of membership in combination of groups using sparklyr or dplyr
我有一个 spark 数据框,我正在使用 sparklyr 进行操作,如下所示:
# Sample input: one row per observation, giving the id, the observation
# date (kept as a string), the group recorded on that row, and a 0/1 flag
# where 1 means the event occurred.
input_data <- data.frame(
  id = c(
    10, 10, 10, 20, 20, 30, 30, 40, 40, 40, 50, 60, 70, 80,
    80, 80, 100, 100, 110, 110, 120, 120, 120, 130, 140, 150, 160, 170
  ),
  date = c(
    "2021-01-01", "2021-01-02", "2021-01-03", "2021-01-01", "2021-01-02",
    "2021-01-01", "2021-01-02", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-01", "2021-01-02", "2021-01-05", "2021-01-01", "2021-01-02",
    "2021-01-03", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-02", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-05", "2021-01-01", "2021-01-05"
  ),
  group = c(
    "A", "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A", "B", "A",
    "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A", "A", "B", "A"
  ),
  event = c(
    1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
    1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0
  )
)
我想对数据进行聚合,得到每种分组组合下“事件”(event == 1)和“非事件”(event == 0)的数量,最终输出如下所示:
# Desired result: one row per combination of groups (1 = the group is part
# of the combination), with the event / non-event counts for that combination.
data.frame(
  group_a = c(1, 0, 0, 1, 0, 1),
  group_b = c(0, 1, 0, 1, 1, 0),
  group_c = c(0, 0, 1, 0, 1, 1),
  event_occured = c(3, 1, 2, 0, 2, 2),
  event_not_occured = c(4, 2, 2, 0, 2, 2)
)
例如,没有任何 ID 同时属于 A 组和 B 组,因此该组合的 event 和 non_event 都为 0。A 组涉及的 ID 有 4 个,其中 3 个为 event、1 个为 non_event,以此类推。
使用 sparklyr(或 dplyr、pyspark)可以用什么方法完成上述聚合?我尝试了下面的方法,但得到的 event 数量与 event_not_occured 数量完全相同,所以我一定是哪里做错了,却找不出原因:
# Asker's attempt, kept exactly as posted (it is the code being debugged).
# Part 1: within each id, derive `order_seq` from the running count of
# events on earlier rows.
combo_path_sdf <- input_data %>%
group_by(id) %>%
arrange(date) %>%
# 1 on rows where the event occurred, NA otherwise.
mutate(order_seq = ifelse(event > 0, 1, NA)) %>%
# Running event count (NA treated as 0), shifted down one row by lag().
mutate(order_seq = lag(cumsum(ifelse(is.na(order_seq), 0, order_seq)))) %>%
# First row of each id: -1 if it is an event, 0 if not; every other row
# keeps the lagged running count.
mutate(order_seq = ifelse((row_number() == 1) & (event > 0), -1, ifelse(row_number() == 1, 0, order_seq))) %>%
ungroup()
# Part 2: aggregate per (id, order_seq), then per group combination.
combo_path_sdf %>%
group_by(id, order_seq) %>%
# NOTE(review): the visible code never creates `group_a`, `group_b` or
# `group_c`; `input_data` only has a `group` column. Presumably these
# comparisons were meant to read `group == "A"` etc. -- confirm.
summarize(group_a = max(ifelse(group_a == "A", 1, 0)),
group_b = max(ifelse(group_b == "B", 1, 0)),
group_c = max(ifelse(group_c == "C", 1, 0)),
events = sum(event)) %>%
group_by(order_seq, group_a, group_b, group_c) %>%
# `total_sequences` counts the (id, order_seq) rows in each combination.
summarize(event = sum(events),
total_sequences = n()) %>%
# Non-events are derived by subtraction here; rows with event == 0 are
# never counted directly.
mutate(event_not_occured = total_sequences - event)
最终输出如下格式也可以:
# Alternative acceptable result: the combination is given as a single
# comma-separated label instead of three indicator columns.
data.frame(
  group_a = c("A", "B", "C", "A,B", "B,C", "A,C"),
  event_occured = c(3, 1, 2, 1, 2, 2),
  event_not_occured = c(4, 2, 2, 1, 2, 2)
)
(下图 A、B 不正确,应该是 1,1 而不是 0,0)
以下匹配您请求的输出格式,并以我理解您想要的方式处理数据,但是(根据@Martin Gal 的评论)与您提供的示例结果不匹配。
# Two-step aggregation.
# Step 1: collapse to one row per id, flagging (1/0) whether the id ever
#   appears in group A, B or C, and counting its event / non-event rows.
# Step 2: pool the ids that share the same flag pattern and add up their
#   counts, giving one row per observed group combination.
input_data %>%
  group_by(id) %>%
  summarise(
    group_a = max(ifelse(group == "A", 1, 0)),
    group_b = max(ifelse(group == "B", 1, 0)),
    group_c = max(ifelse(group == "C", 1, 0)),
    event_occured = sum(ifelse(event == 1, 1, 0)),
    event_not_occured = sum(ifelse(event == 0, 1, 0)),
    .groups = "drop"
  ) %>%
  group_by(group_a, group_b, group_c) %>%
  summarise(
    event_occured = sum(event_occured),
    event_not_occured = sum(event_not_occured),
    .groups = "drop"
  )
思路是分两步汇总。第一步按 id 汇总,为每个组生成一个指示变量,并统计事件/非事件(events/non-events)的数量;第二步把组指示变量相同的行合并起来。
关于您那段得出相同事件数和非事件数的代码:请检查 hts_combined。它没有在您分享的代码中定义,因此您的脚本可能是从别处读取了这个变量。
我有一个 spark 数据框,我正在使用 sparklyr 进行操作,如下所示:
# Sample input: one row per observation, giving the id, the observation
# date (kept as a string), the group recorded on that row, and a 0/1 flag
# where 1 means the event occurred.
input_data <- data.frame(
  id = c(
    10, 10, 10, 20, 20, 30, 30, 40, 40, 40, 50, 60, 70, 80,
    80, 80, 100, 100, 110, 110, 120, 120, 120, 130, 140, 150, 160, 170
  ),
  date = c(
    "2021-01-01", "2021-01-02", "2021-01-03", "2021-01-01", "2021-01-02",
    "2021-01-01", "2021-01-02", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-01", "2021-01-02", "2021-01-05", "2021-01-01", "2021-01-02",
    "2021-01-03", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-02", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-02",
    "2021-01-05", "2021-01-01", "2021-01-05"
  ),
  group = c(
    "A", "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A", "B", "A",
    "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A", "A", "B", "A"
  ),
  event = c(
    1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
    1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0
  )
)
我想对数据进行聚合,得到每种分组组合下“事件”(event == 1)和“非事件”(event == 0)的数量,最终输出如下所示:
# Desired result: one row per combination of groups (1 = the group is part
# of the combination), with the event / non-event counts for that combination.
data.frame(
  group_a = c(1, 0, 0, 1, 0, 1),
  group_b = c(0, 1, 0, 1, 1, 0),
  group_c = c(0, 0, 1, 0, 1, 1),
  event_occured = c(3, 1, 2, 0, 2, 2),
  event_not_occured = c(4, 2, 2, 0, 2, 2)
)
例如,没有任何 ID 同时属于 A 组和 B 组,因此该组合的 event 和 non_event 都为 0。A 组涉及的 ID 有 4 个,其中 3 个为 event、1 个为 non_event,以此类推。
使用 sparklyr(或 dplyr、pyspark)可以用什么方法完成上述聚合?我尝试了下面的方法,但得到的 event 数量与 event_not_occured 数量完全相同,所以我一定是哪里做错了,却找不出原因:
# Asker's attempt, kept exactly as posted (it is the code being debugged).
# Part 1: within each id, derive `order_seq` from the running count of
# events on earlier rows.
combo_path_sdf <- input_data %>%
group_by(id) %>%
arrange(date) %>%
# 1 on rows where the event occurred, NA otherwise.
mutate(order_seq = ifelse(event > 0, 1, NA)) %>%
# Running event count (NA treated as 0), shifted down one row by lag().
mutate(order_seq = lag(cumsum(ifelse(is.na(order_seq), 0, order_seq)))) %>%
# First row of each id: -1 if it is an event, 0 if not; every other row
# keeps the lagged running count.
mutate(order_seq = ifelse((row_number() == 1) & (event > 0), -1, ifelse(row_number() == 1, 0, order_seq))) %>%
ungroup()
# Part 2: aggregate per (id, order_seq), then per group combination.
combo_path_sdf %>%
group_by(id, order_seq) %>%
# NOTE(review): the visible code never creates `group_a`, `group_b` or
# `group_c`; `input_data` only has a `group` column. Presumably these
# comparisons were meant to read `group == "A"` etc. -- confirm.
summarize(group_a = max(ifelse(group_a == "A", 1, 0)),
group_b = max(ifelse(group_b == "B", 1, 0)),
group_c = max(ifelse(group_c == "C", 1, 0)),
events = sum(event)) %>%
group_by(order_seq, group_a, group_b, group_c) %>%
# `total_sequences` counts the (id, order_seq) rows in each combination.
summarize(event = sum(events),
total_sequences = n()) %>%
# Non-events are derived by subtraction here; rows with event == 0 are
# never counted directly.
mutate(event_not_occured = total_sequences - event)
最终输出如下格式也可以:
# Alternative acceptable result: the combination is given as a single
# comma-separated label instead of three indicator columns.
data.frame(
  group_a = c("A", "B", "C", "A,B", "B,C", "A,C"),
  event_occured = c(3, 1, 2, 1, 2, 2),
  event_not_occured = c(4, 2, 2, 1, 2, 2)
)
(下图 A、B 不正确,应该是 1,1 而不是 0,0)
以下匹配您请求的输出格式,并以我理解您想要的方式处理数据,但是(根据@Martin Gal 的评论)与您提供的示例结果不匹配。
# Two-step aggregation.
# Step 1: collapse to one row per id, flagging (1/0) whether the id ever
#   appears in group A, B or C, and counting its event / non-event rows.
# Step 2: pool the ids that share the same flag pattern and add up their
#   counts, giving one row per observed group combination.
input_data %>%
  group_by(id) %>%
  summarise(
    group_a = max(ifelse(group == "A", 1, 0)),
    group_b = max(ifelse(group == "B", 1, 0)),
    group_c = max(ifelse(group == "C", 1, 0)),
    event_occured = sum(ifelse(event == 1, 1, 0)),
    event_not_occured = sum(ifelse(event == 0, 1, 0)),
    .groups = "drop"
  ) %>%
  group_by(group_a, group_b, group_c) %>%
  summarise(
    event_occured = sum(event_occured),
    event_not_occured = sum(event_not_occured),
    .groups = "drop"
  )
思路是分两步汇总。第一步按 id 汇总,为每个组生成一个指示变量,并统计事件/非事件(events/non-events)的数量;第二步把组指示变量相同的行合并起来。
关于您那段得出相同事件数和非事件数的代码:请检查 hts_combined。它没有在您分享的代码中定义,因此您的脚本可能是从别处读取了这个变量。