基于两个条件进行求和与计数

Question

我无法解决以下问题。

Geslacht               persondays age_cat contactfirst         
 1 V                          365 <40     2020            
 2 V                          365 <40     2019            
 3 V                          365 70-80   2019            
 4 V                          365 50-60   2019            
 5 V                          365 60-70   2020            
 6 M                          365 50-60   2020            
 7 V                          365 60-70   2019            
 8 M                           39 60-70   2019            
 9 V                          365 60-70   2019            
10 M                          365 70-80   2020            

# Reproducible sample data in dput() form: a 50-row data frame carrying the
# tibble classes ("tbl_df", "tbl", "data.frame").
#   Geslacht              - character, values "V" / "M"
#   persondays_individual - numeric; 365 for every row except one row with 39
#   age_cat               - factor with 7 levels ("<40", ">90", "40-50", ...,
#                           "80-90"); level ">90" is defined but not used
#   contactfirst_cat      - factor with 11 levels ("<2011", "2011", ..., "2020");
#                           only codes 10 ("2019") and 11 ("2020") occur here
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V", 
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M", 
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M", 
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M", 
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365, 
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L, 
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L, 
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L, 
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L, 
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80", 
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L, 
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L, 
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L, 
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

我想创建两个新变量。第一个按 age_cat 和性别分组，对人天数求和；第二个统计每组中 'contactfirst' 为特定年份（本例中为 2019 年）的记录数。

期望的输出：

Gender       age_cat    persondays_total    contactfirst_total
  V            <40           730            1
  V            50-60         365            1
  V            60-70         1095           2
  V            70-80         365            1
  M            50-60         365            0
  M            60-70         39             1
  M            70-80         365            0

我已经尝试用 group_by 来做到这一点（代码如下）。但这会把数据框中所有行的人天数加在一起（而不是按性别和 age_category 分组求和），并且没有生成 newcontacts 列。

df2 <- df %>% group_by(Gender, age_cat) %>%
         mutate(personyears_total = sum(persondays_individual)) %>%
         mutate(newcontacts = nrow(df$contactfirst == "2019"

如有任何帮助，我们将不胜感激。

Answer 1

您应该使用 summarize 而不是 mutate，因为您正在为每个组创建一些摘要。此外，使用 sum(contactfirst_cat == "2019") 来计算满足特定条件的记录，而不是 nrow()，后者计算数据框中的行数。

library(dplyr)

# Collapse the data to one row per (Geslacht, age_cat) combination.
df %>%
  group_by(Geslacht, age_cat) %>%
  summarise(
    # Total person-days within the group.
    persondays_total = sum(persondays_individual),
    # Summing a logical vector counts its TRUE values, i.e. the number of
    # rows in the group whose contactfirst_cat is "2019".
    contactfirst_total = sum(contactfirst_cat == "2019"),
    # Return an ungrouped tibble.
    .groups = "drop"
  )

# A tibble: 11 × 4
   Geslacht age_cat persondays_total contactfirst_total
   <chr>    <fct>              <dbl>              <int>
 1 M        <40                 1095                  1
 2 M        40-50               1095                  3
 3 M        50-60               2920                  5
 4 M        60-70               2959                  4
 5 M        70-80               1460                  2
 6 M        80-90                730                  1
 7 V        <40                  730                  1
 8 V        50-60               1460                  1
 9 V        60-70               2555                  4
10 V        70-80               1825                  2
11 V        80-90               1095                  2

数据

# Reproducible sample data in dput() form: a 50-row data frame carrying the
# tibble classes ("tbl_df", "tbl", "data.frame").
#   Geslacht              - character, values "V" / "M"
#   persondays_individual - numeric; 365 for every row except one row with 39
#   age_cat               - factor with 7 levels ("<40", ">90", "40-50", ...,
#                           "80-90"); level ">90" is defined but not used
#   contactfirst_cat      - factor with 11 levels ("<2011", "2011", ..., "2020");
#                           only codes 10 ("2019") and 11 ("2020") occur here
df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V", 
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M", 
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M", 
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M", 
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365, 
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L, 
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L, 
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L, 
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L, 
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80", 
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L, 
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L, 
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L, 
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

Answer 2

library(data.table)
# Convert df to a data.table. setDT() works by reference, so df itself is
# changed in place rather than copied.
setDT(df)
# One row per (Geslacht, age_cat) group; keyby also sorts by those columns.
#   persondays_total   - sum of person-days in the group
#   contactfirst_total - number of rows whose contactfirst_cat is "2019"
# contactfirst_cat is a factor, so compare it with the level label "2019"
# (a string) instead of relying on implicit numeric-to-character coercion.
df[, .(persondays_total = sum(persondays_individual),
       contactfirst_total = sum(contactfirst_cat == "2019")),
   keyby = .(Geslacht, age_cat)]
#    Geslacht age_cat persondays_total contactfirst_total
# 1:        M     <40             1095                  1
# 2:        M   40-50             1095                  3
# 3:        M   50-60             2920                  5
# 4:        M   60-70             2959                  4
# 5:        M   70-80             1460                  2
# 6:        M   80-90              730                  1
# 7:        V     <40              730                  1
# 8:        V   50-60             1460                  1
# 9:        V   60-70             2555                  4
#10:        V   70-80             1825                  2
#11:        V   80-90             1095                  2

Answer 3

如果你不想依赖额外的包，可以试试基础 R 的 aggregate。

# Base-R version: sum both columns within each age_cat x Geslacht group.
# transform() first replaces contactfirst_cat with a logical flag marking the
# rows whose level is "2019"; summing that flag counts them. The factor is
# compared with the string label "2019" rather than the number 2019, so the
# match does not rely on implicit numeric-to-character coercion.
aggregate(cbind(persondays_individual, contactfirst_cat) ~ age_cat + Geslacht,
          data = transform(df, contactfirst_cat = contactfirst_cat == "2019"),
          FUN = sum)
#    age_cat Geslacht persondays_individual contactfirst_cat
# 1      <40        M                  1095                1
# 2    40-50        M                  1095                3
# 3    50-60        M                  2920                5
# 4    60-70        M                  2959                4
# 5    70-80        M                  1460                2
# 6    80-90        M                   730                1
# 7      <40        V                   730                1
# 8    50-60        V                  1460                1
# 9    60-70        V                  2555                4
# 10   70-80        V                  1825                2
# 11   80-90        V                  1095                2

基于两个条件进行求和与计数

sum and count value based on two criteria

r

sum

count

conditional-statements

数据