基于两个标准求和和计数值
sum and count value based on two criteria
我无法解决以下问题。
Geslacht persondays age_cat contactfirst
1 V 365 <40 2020
2 V 365 <40 2019
3 V 365 70-80 2019
4 V 365 50-60 2019
5 V 365 60-70 2020
6 M 365 50-60 2020
7 V 365 60-70 2019
8 M 39 60-70 2019
9 V 365 60-70 2019
10 M 365 70-80 2020
# Example data (dput output): a 50-row tibble with four columns.
#   Geslacht              - character vector of "V" / "M"
#   persondays_individual - numeric (365 or 39 in this sample)
#   age_cat               - factor with levels "<40", ">90", "40-50", ..., "80-90"
#   contactfirst_cat      - factor with levels "<2011", "2011", ..., "2020"
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V",
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M",
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M",
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M",
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365,
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L,
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L,
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L,
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L,
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80",
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L,
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L,
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L,
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L,
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011",
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018",
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
我想创建两个新变量。第一个按 age_cat 和性别分组,对人天数求和;第二个统计每组中 'contactfirst' 为特定年份(本例中为 2019 年)的记录数。
期望的输出:
Gender age_cat persondays_total contactfirst_total
V <40 730 1
V 50-60 365 1
V 60-70 1095 2
V 70-80 365 1
M 50-60 365 0
M 60-70 39 1
M 70-80 365 0
我已经尝试用 group_by 来实现这一点(代码如下)。但它计算的是整个数据框的总人天数(而不是按性别和 age_category 分组计算),并且没有生成 newcontacts 列。
df2 <- df %>% group_by(Gender, age_cat) %>%
mutate(personyears_total = sum(persondays_individual)) %>%
mutate(newcontacts = nrow(df$contactfirst == "2019"
如有任何帮助,我们将不胜感激。
您应该使用 summarize 而不是 mutate,因为您要为每个组计算汇总值。此外,应使用 sum(contactfirst_cat == "2019") 来统计满足特定条件的记录数,而不是 nrow()——后者返回的是数据框的行数。
library(dplyr)
# Collapse df to one row per Geslacht x age_cat combination:
#   persondays_total   - sum of persondays_individual within the group
#   contactfirst_total - number of rows in the group whose contactfirst_cat
#                        equals "2019" (summing a logical counts the TRUEs)
# .groups = "drop" returns an ungrouped result.
df %>%
  group_by(Geslacht, age_cat) %>%
  summarize(
    persondays_total = sum(persondays_individual),
    contactfirst_total = sum(contactfirst_cat == "2019"),
    .groups = "drop"
  )
# A tibble: 11 × 4
Geslacht age_cat persondays_total contactfirst_total
<chr> <fct> <dbl> <int>
1 M <40 1095 1
2 M 40-50 1095 3
3 M 50-60 2920 5
4 M 60-70 2959 4
5 M 70-80 1460 2
6 M 80-90 730 1
7 V <40 730 1
8 V 50-60 1460 1
9 V 60-70 2555 4
10 V 70-80 1825 2
11 V 80-90 1095 2
数据
# Example data (dput output): a 50-row tibble with four columns.
#   Geslacht              - character vector of "V" / "M"
#   persondays_individual - numeric (365 or 39 in this sample)
#   age_cat               - factor with levels "<40", ">90", "40-50", ..., "80-90"
#   contactfirst_cat      - factor with levels "<2011", "2011", ..., "2020"
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V",
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M",
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M",
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M",
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365,
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L,
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L,
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L,
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L,
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80",
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L,
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L,
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L,
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L,
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011",
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018",
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
library(data.table)
# Convert df to a data.table by reference (df itself is modified, no copy).
setDT(df)
# One row per Geslacht x age_cat, ordered by those columns (keyby):
#   persondays_total   - sum of persondays_individual within the group
#   contactfirst_total - number of rows whose contactfirst_cat is "2019"
# contactfirst_cat is a factor, so the year is compared as a string rather
# than relying on implicit numeric-to-character coercion of 2019.
df[, .(
  persondays_total = sum(persondays_individual),
  contactfirst_total = sum(contactfirst_cat == "2019")
), keyby = .(Geslacht, age_cat)]
# Geslacht age_cat persondays_total contactfirst_total
# 1: M <40 1095 1
# 2: M 40-50 1095 3
# 3: M 50-60 2920 5
# 4: M 60-70 2959 4
# 5: M 70-80 1460 2
# 6: M 80-90 730 1
# 7: V <40 730 1
# 8: V 50-60 1460 1
# 9: V 60-70 2555 4
#10: V 70-80 1825 2
#11: V 80-90 1095 2
如果你不想依赖额外的包,可以试试基础 R 的 aggregate。
# Base-R approach: transform() first replaces contactfirst_cat with a logical
# flag (TRUE where the year is "2019"); aggregate() then sums both columns
# within each age_cat x Geslacht group, so summing the flag counts the matches.
# contactfirst_cat is a factor, so the year is compared as a string rather
# than relying on implicit numeric-to-character coercion of 2019.
aggregate(
  cbind(persondays_individual, contactfirst_cat) ~ age_cat + Geslacht,
  data = transform(df, contactfirst_cat = contactfirst_cat == "2019"),
  FUN = sum
)
# age_cat Geslacht persondays_individual contactfirst_cat
# 1 <40 M 1095 1
# 2 40-50 M 1095 3
# 3 50-60 M 2920 5
# 4 60-70 M 2959 4
# 5 70-80 M 1460 2
# 6 80-90 M 730 1
# 7 <40 V 730 1
# 8 50-60 V 1460 1
# 9 60-70 V 2555 4
# 10 70-80 V 1825 2
# 11 80-90 V 1095 2
我无法解决以下问题。
Geslacht persondays age_cat contactfirst
1 V 365 <40 2020
2 V 365 <40 2019
3 V 365 70-80 2019
4 V 365 50-60 2019
5 V 365 60-70 2020
6 M 365 50-60 2020
7 V 365 60-70 2019
8 M 39 60-70 2019
9 V 365 60-70 2019
10 M 365 70-80 2020
# Example data (dput output): a 50-row tibble with four columns.
#   Geslacht              - character vector of "V" / "M"
#   persondays_individual - numeric (365 or 39 in this sample)
#   age_cat               - factor with levels "<40", ">90", "40-50", ..., "80-90"
#   contactfirst_cat      - factor with levels "<2011", "2011", ..., "2020"
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V",
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M",
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M",
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M",
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365,
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L,
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L,
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L,
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L,
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80",
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L,
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L,
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L,
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L,
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011",
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018",
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
我想创建两个新变量。第一个按 age_cat 和性别分组,对人天数求和;第二个统计每组中 'contactfirst' 为特定年份(本例中为 2019 年)的记录数。
期望的输出:
Gender age_cat persondays_total contactfirst_total
V <40 730 1
V 50-60 365 1
V 60-70 1095 2
V 70-80 365 1
M 50-60 365 0
M 60-70 39 1
M 70-80 365 0
我已经尝试用 group_by 来实现这一点(代码如下)。但它计算的是整个数据框的总人天数(而不是按性别和 age_category 分组计算),并且没有生成 newcontacts 列。
df2 <- df %>% group_by(Gender, age_cat) %>%
mutate(personyears_total = sum(persondays_individual)) %>%
mutate(newcontacts = nrow(df$contactfirst == "2019"
如有任何帮助,我们将不胜感激。
您应该使用 summarize 而不是 mutate,因为您要为每个组计算汇总值。此外,应使用 sum(contactfirst_cat == "2019") 来统计满足特定条件的记录数,而不是 nrow()——后者返回的是数据框的行数。
library(dplyr)
# Collapse df to one row per Geslacht x age_cat combination:
#   persondays_total   - sum of persondays_individual within the group
#   contactfirst_total - number of rows in the group whose contactfirst_cat
#                        equals "2019" (summing a logical counts the TRUEs)
# .groups = "drop" returns an ungrouped result.
df %>%
  group_by(Geslacht, age_cat) %>%
  summarize(
    persondays_total = sum(persondays_individual),
    contactfirst_total = sum(contactfirst_cat == "2019"),
    .groups = "drop"
  )
# A tibble: 11 × 4
Geslacht age_cat persondays_total contactfirst_total
<chr> <fct> <dbl> <int>
1 M <40 1095 1
2 M 40-50 1095 3
3 M 50-60 2920 5
4 M 60-70 2959 4
5 M 70-80 1460 2
6 M 80-90 730 1
7 V <40 730 1
8 V 50-60 1460 1
9 V 60-70 2555 4
10 V 70-80 1825 2
11 V 80-90 1095 2
数据
# Example data (dput output): a 50-row tibble with four columns.
#   Geslacht              - character vector of "V" / "M"
#   persondays_individual - numeric (365 or 39 in this sample)
#   age_cat               - factor with levels "<40", ">90", "40-50", ..., "80-90"
#   contactfirst_cat      - factor with levels "<2011", "2011", ..., "2020"
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V",
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M",
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M",
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M",
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365,
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365,
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L,
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L,
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L,
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L,
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80",
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L,
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L,
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L,
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L,
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011",
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018",
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
library(data.table)
# Convert df to a data.table by reference (df itself is modified, no copy).
setDT(df)
# One row per Geslacht x age_cat, ordered by those columns (keyby):
#   persondays_total   - sum of persondays_individual within the group
#   contactfirst_total - number of rows whose contactfirst_cat is "2019"
# contactfirst_cat is a factor, so the year is compared as a string rather
# than relying on implicit numeric-to-character coercion of 2019.
df[, .(
  persondays_total = sum(persondays_individual),
  contactfirst_total = sum(contactfirst_cat == "2019")
), keyby = .(Geslacht, age_cat)]
# Geslacht age_cat persondays_total contactfirst_total
# 1: M <40 1095 1
# 2: M 40-50 1095 3
# 3: M 50-60 2920 5
# 4: M 60-70 2959 4
# 5: M 70-80 1460 2
# 6: M 80-90 730 1
# 7: V <40 730 1
# 8: V 50-60 1460 1
# 9: V 60-70 2555 4
#10: V 70-80 1825 2
#11: V 80-90 1095 2
如果你不想依赖额外的包,可以试试基础 R 的 aggregate。
# Base-R approach: transform() first replaces contactfirst_cat with a logical
# flag (TRUE where the year is "2019"); aggregate() then sums both columns
# within each age_cat x Geslacht group, so summing the flag counts the matches.
# contactfirst_cat is a factor, so the year is compared as a string rather
# than relying on implicit numeric-to-character coercion of 2019.
aggregate(
  cbind(persondays_individual, contactfirst_cat) ~ age_cat + Geslacht,
  data = transform(df, contactfirst_cat = contactfirst_cat == "2019"),
  FUN = sum
)
# age_cat Geslacht persondays_individual contactfirst_cat
# 1 <40 M 1095 1
# 2 40-50 M 1095 3
# 3 50-60 M 2920 5
# 4 60-70 M 2959 4
# 5 70-80 M 1460 2
# 6 80-90 M 730 1
# 7 <40 V 730 1
# 8 50-60 V 1460 1
# 9 60-70 V 2555 4
# 10 70-80 V 1825 2
# 11 80-90 V 1095 2