汇总一组数据框——改进一个笨拙的解决方案
Summarizing a collection of data frames - improving upon a clumsy solution
我有一组数据框 df_i,每个数据框记录了一组患者第 i 次到医院就诊的情况。我想对每个数据框进行汇总,以确定第 i 次就诊时的男性人数、女性人数和患者总数。虽然我可以解决这个问题,但我的解决方案很笨拙。有没有更简单的方法来得到我想要的最终数据框?示例如下:
# Visit 1: ten daily records for patients A-D (A and B male, C and D female).
df_1 <- data.frame(
  ID = rep(c("A", "B", "C", "D"), times = c(4, 3, 2, 1)),
  Dates = seq(as.Date("2020-01-01"), as.Date("2020-01-10"), by = "day"),
  Sex = rep(c("Male", "Female"), times = c(7, 3)),
  Weight = seq(from = 100, to = 190, by = 10),
  Visit = rep(1, times = 10)
)
# Visit 2: nine daily records for patients A-C (A and B male, C female).
df_2 <- data.frame(
  ID = c(rep("A", 4), rep("B", 3), rep("C", 2)),
  Dates = seq.Date(
    from = as.Date("2020-02-01"),
    to = as.Date("2020-02-09"),
    by = "day"
  ),
  Sex = c(rep("Male", 4), rep("Male", 3), rep("Female", 2)),
  Weight = seq(100, 180, 10),
  # One Visit value per row: the frame has 9 rows, and a length-5 vector
  # makes data.frame() stop with a differing-number-of-rows error.
  Visit = rep(2, 9)
)
# Visit 3: seven daily records for patients A and B (both male).
df_3 <- data.frame(
  ID = rep(c("A", "B"), times = c(4, 3)),
  Dates = seq(as.Date("2020-03-01"), as.Date("2020-03-07"), by = "day"),
  Sex = rep("Male", times = 7),
  Weight = seq(from = 140, to = 200, by = 10),
  Visit = rep(3, times = 7)
)
我希望生成以下结果:
> df_sum
Visit Patients Men Women
1 1 4 2 2
2 2 3 2 1
3 3 2 2 0
我可以用一种非常笨拙的方式来做到这一点:首先创建一个临时数据框来汇总 df_1 中的信息:
# Per patient: keep the earliest-dated row(s), then count those rows and
# how many of them are male / female.
df_tmp <- df_1 %>%
  group_by(ID) %>%
  filter(Dates == min(Dates)) %>%
  summarise(
    n = n(),
    Men = sum(Sex == "Male"),
    Women = sum(Sex == "Female")
  )
> df_tmp
# A tibble: 4 x 4
ID n Men Women
<chr> <int> <int> <int>
1 A 1 1 0
2 B 1 1 0
3 C 1 0 1
4 D 1 0 1
接下来,对 df_tmp 中的每一列求和,得到摘要数据框的第一行。
# Column totals of the per-patient table, in the order n, Men, Women.
r1 <- c(
  sum(df_tmp[["n"]]),
  sum(df_tmp[["Men"]]),
  sum(df_tmp[["Women"]])
)
对第二个和第三个数据框重复上述步骤,最后将各行绑定在一起以创建摘要数据框。虽然这样可行,但非常笨拙,而且无法推广到就诊次数不固定的情况。有人可以为我的问题指出一个更优雅的解决方案吗?
非常感谢
托马斯·飞利浦
将数据框放入列表中并通过 map 遍历它们,这样就不必为每个数据框重复代码。使用 janitor::adorn_totals 可以在输出中添加一行总计,然后再将数据转换为宽格式。
library(tidyverse)
# Collect the per-visit data frames so one pipeline handles all of them.
list_df <- list(df_1, df_2, df_3)

# For each visit: keep every patient's earliest-dated row(s), count rows by
# sex, and append a "Patients" totals row. The list position becomes the
# `Visit` column; the counts are then spread into one column per Sex value.
map_df(
  list_df,
  function(visit_df) {
    visit_df %>%
      group_by(ID) %>%
      filter(Dates == min(Dates)) %>%
      ungroup() %>%
      count(Sex) %>%
      janitor::adorn_totals(name = "Patients")
  },
  .id = "Visit"
) %>%
  pivot_wider(names_from = Sex, values_from = n, values_fill = 0)
# Visit Female Male Patients
# <chr> <int> <int> <int>
#1 1 2 2 4
#2 2 1 2 3
#3 3 0 2 2
也可以用 bind_rows:
library(tidyverse)
# Stack the visits (tagging each row with its source position in `day`),
# keep each patient's earliest-dated row(s) within a visit, then count
# rows, men and women per visit.
bind_rows(df_1, df_2, df_3, .id = "day") %>%
  group_by(day, ID) %>%
  slice_min(order_by = Dates) %>%
  group_by(day) %>%
  summarise(
    n = n(),
    Men = sum(Sex == "Male"),
    Women = sum(Sex == "Female")
  )
结果
# A tibble: 3 x 4
day n Men Women
* <chr> <int> <int> <int>
1 1 4 2 2
2 2 3 2 1
3 3 2 2 0
我有一组数据框 df_i,每个数据框记录了一组患者第 i 次到医院就诊的情况。我想对每个数据框进行汇总,以确定第 i 次就诊时的男性人数、女性人数和患者总数。虽然我可以解决这个问题,但我的解决方案很笨拙。有没有更简单的方法来得到我想要的最终数据框?示例如下:
# Visit 1: ten daily records for patients A-D (A and B male, C and D female).
df_1 <- data.frame(
  ID = rep(c("A", "B", "C", "D"), times = c(4, 3, 2, 1)),
  Dates = seq(as.Date("2020-01-01"), as.Date("2020-01-10"), by = "day"),
  Sex = rep(c("Male", "Female"), times = c(7, 3)),
  Weight = seq(from = 100, to = 190, by = 10),
  Visit = rep(1, times = 10)
)
# Visit 2: nine daily records for patients A-C (A and B male, C female).
df_2 <- data.frame(
  ID = c(rep("A", 4), rep("B", 3), rep("C", 2)),
  Dates = seq.Date(
    from = as.Date("2020-02-01"),
    to = as.Date("2020-02-09"),
    by = "day"
  ),
  Sex = c(rep("Male", 4), rep("Male", 3), rep("Female", 2)),
  Weight = seq(100, 180, 10),
  # One Visit value per row: the frame has 9 rows, and a length-5 vector
  # makes data.frame() stop with a differing-number-of-rows error.
  Visit = rep(2, 9)
)
# Visit 3: seven daily records for patients A and B (both male).
df_3 <- data.frame(
  ID = rep(c("A", "B"), times = c(4, 3)),
  Dates = seq(as.Date("2020-03-01"), as.Date("2020-03-07"), by = "day"),
  Sex = rep("Male", times = 7),
  Weight = seq(from = 140, to = 200, by = 10),
  Visit = rep(3, times = 7)
)
我希望生成以下结果:
> df_sum
Visit Patients Men Women
1 1 4 2 2
2 2 3 2 1
3 3 2 2 0
我可以用一种非常笨拙的方式来做到这一点:首先创建一个临时数据框来汇总 df_1 中的信息:
# Per patient: keep the earliest-dated row(s), then count those rows and
# how many of them are male / female.
df_tmp <- df_1 %>%
  group_by(ID) %>%
  filter(Dates == min(Dates)) %>%
  summarise(
    n = n(),
    Men = sum(Sex == "Male"),
    Women = sum(Sex == "Female")
  )
> df_tmp
# A tibble: 4 x 4
ID n Men Women
<chr> <int> <int> <int>
1 A 1 1 0
2 B 1 1 0
3 C 1 0 1
4 D 1 0 1
接下来,对 df_tmp 中的每一列求和,得到摘要数据框的第一行。
# Column totals of the per-patient table, in the order n, Men, Women.
r1 <- c(
  sum(df_tmp[["n"]]),
  sum(df_tmp[["Men"]]),
  sum(df_tmp[["Women"]])
)
对第二个和第三个数据框重复上述步骤,最后将各行绑定在一起以创建摘要数据框。虽然这样可行,但非常笨拙,而且无法推广到就诊次数不固定的情况。有人可以为我的问题指出一个更优雅的解决方案吗?
非常感谢
托马斯·飞利浦
将数据框放入列表中并通过 map 遍历它们,这样就不必为每个数据框重复代码。使用 janitor::adorn_totals 可以在输出中添加一行总计,然后再将数据转换为宽格式。
library(tidyverse)
# Collect the per-visit data frames so one pipeline handles all of them.
list_df <- list(df_1, df_2, df_3)

# For each visit: keep every patient's earliest-dated row(s), count rows by
# sex, and append a "Patients" totals row. The list position becomes the
# `Visit` column; the counts are then spread into one column per Sex value.
map_df(
  list_df,
  function(visit_df) {
    visit_df %>%
      group_by(ID) %>%
      filter(Dates == min(Dates)) %>%
      ungroup() %>%
      count(Sex) %>%
      janitor::adorn_totals(name = "Patients")
  },
  .id = "Visit"
) %>%
  pivot_wider(names_from = Sex, values_from = n, values_fill = 0)
# Visit Female Male Patients
# <chr> <int> <int> <int>
#1 1 2 2 4
#2 2 1 2 3
#3 3 0 2 2
也可以用 bind_rows:
library(tidyverse)
# Stack the visits (tagging each row with its source position in `day`),
# keep each patient's earliest-dated row(s) within a visit, then count
# rows, men and women per visit.
bind_rows(df_1, df_2, df_3, .id = "day") %>%
  group_by(day, ID) %>%
  slice_min(order_by = Dates) %>%
  group_by(day) %>%
  summarise(
    n = n(),
    Men = sum(Sex == "Male"),
    Women = sum(Sex == "Female")
  )
结果
# A tibble: 3 x 4
day n Men Women
* <chr> <int> <int> <int>
1 1 4 2 2
2 2 3 2 1
3 3 2 2 0