按月分组的年龄组

Age groups into monthly buckets

我正在努力为下面的问题寻找解决方案。我有一个包含 id 和出生日期(dob)的数据框 df,以及另一个按月分桶的数据框 monthbucket,如下所示:


# Fix the RNG state so the sampled birth dates are reproducible.
set.seed(33)

# Ten members: a birth date drawn without replacement from every day between
# 1940-01-01 and 2010-01-01, plus a running id.
df <- data.frame(
  dob = sample(
    seq(from = as.Date("1940/01/01"), to = as.Date("2010/01/01"), by = "day"),
    size = 10
  ),
  id = seq_len(10)
)

# One row per calendar month from 2010-01 to 2011-01: a "YYYY-MM" label, the
# first day of the month, and the last day of the month (the day before the
# first day of the following month).
monthbucket <- local({
  first_days <- seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months")
  next_first_days <- seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months")
  data.frame(
    month = format(first_days, "%Y-%m"),
    startmonth = first_days,
    endmonth = next_first_days - 1
  )
})

我想得到这样的输出:对每个月份桶,给出各年龄组(<19、19-64、>64)内的成员数量。由于成员会在这一年中过生日,各组的计数显然会随月份变化。

我用类似的方法计算了年龄:

# Whole-year component of the time span from `dob` to `bucketdate`.
#
# dob        : start of the span (the date of birth).
# bucketdate : end of the span (the date the age is measured at).
#
# interval() and as.period() are not defined in this snippet; presumably they
# are lubridate's and the package must be attached before calling this.
age.fct <- function(dob, bucketdate) {
  lifespan <- interval(dob, bucketdate)
  as.period(lifespan, unit = "year")$year
}

我想,一般的做法是先计算每个成员在每个月份桶中的年龄,把它归入 3 个年龄组之一,然后按月计数。有什么建议吗?

编辑 1.

感谢大家提供的各种方法。我刚刚对这些解决方案运行了一个简短的基准测试,以决定接受哪个答案。不知为何,data.table 的解决方案在我的测试数据集上无法运行,我会在接下来的几天内尽快检查原因。

# Fix the RNG state so the benchmark data set is reproducible.
set.seed(33)

# 10,000 members: a birth date drawn without replacement from every day
# between 1940-01-01 and 2010-01-01, plus a running id.
df <- data.frame(
  dob = sample(
    seq(from = as.Date("1940/01/01"), to = as.Date("2010/01/01"), by = "day"),
    size = 10000
  ),
  id = seq_len(10000)
)

# One row per calendar month from 2010-01 to 2011-01: a "YYYY-MM" label, the
# first day of the month, and the last day of the month.
monthbucket <- local({
  first_days <- seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months")
  next_first_days <- seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months")
  data.frame(
    month = format(first_days, "%Y-%m"),
    startmonth = first_days,
    endmonth = next_first_days - 1
  )
})

# Plain vectors under the names used by the Cole1/Cole3/Cole4 solutions.
birth_days <- df[["dob"]]
month_bucket <- monthbucket[["startmonth"]]

基准测试如下:


# Time each proposed solution once (times = 1L) on the benchmark data set.
# The expressions are the answers' code; only the layout is changed here.
microbenchmark::microbenchmark(
  # M-M: tidyverse join of buckets and members, age from the birth month.
  MM = monthbucket %>%
    group_by_all %>%
    expand(id = df$id) %>%
    left_join(
      .,
      {
        df %>% mutate(birth_month = cut(dob, "month"))
      },
      by = "id"
    ) %>%
    mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
    mutate(
      age_cat = case_when(
        age < 19 ~ "<19",
        age > 64 ~ ">64",
        TRUE ~ "19-64"
      )
    ) %>%
    group_by(month) %>%
    count(age_cat) %>%
    gather(variable, count, n) %>%
    unite(variable, age_cat) %>%
    spread(variable, count),
  # AkselA: outer() age matrix, cut() per bucket column, table() per column.
  AkselA = {
    ages <- as.data.frame(
      t(unclass(outer(monthbucket$startmonth, df$dob, "-") / 365.25))
    )
    ages <- do.call(
      data.frame,
      lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+"))
    )
    ages <- sapply(ages, table)
    colnames(ages) <- monthbucket$month
  },
  # Cole1: outer() age matrix, cut() per member column, one two-way table().
  Cole1 = {
    t(
      table(
        apply(
          X = outer(month_bucket, birth_days, `-`) / 365.25,
          MARGIN = 2,
          FUN = cut,
          c(0, 19, 65, Inf)
        ),
        rep(format(month_bucket, "%Y-%m"), length(birth_days))
      )
    )
  },
  # The data.table variant is left disabled, as in the recorded run.
  # NOTE(review): it calls `cast`, whereas the data.table answer uses `dcast`;
  # that may be why it failed -- confirm before re-enabling.
  # cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
  # },
  #
  # Cole3: dplyr/tidyr cross join, count, spread.
  Cole3 = {
    crossing(month_bucket, birth_days) %>%
      count(
        month_bucket,
        age_range = cut(
          as.numeric(month_bucket - birth_days) / 365.25,
          c(0, 19, 65, Inf)
        )
      ) %>%
      spread(age_range, n)
  },
  # Cole4: base R expand.grid(), aggregate(), reshape().
  Cole4 = {
    all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
    all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
    all_combos$cut_r <- cut(all_combos$age, c(0, 19, 65, Inf))
    reshape(
      data = aggregate(
        all_combos$month_bucket,
        by = list(
          bucket = all_combos$month_bucket,
          age_group = all_combos$cut_r
        ),
        FUN = length
      ),
      timevar = "age_group",
      idvar = "bucket",
      direction = "wide"
    )
  },
  times = 1L
)

Unit: milliseconds
   expr        min         lq       mean     median         uq        max neval
     MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810     1
 AkselA   17.12697   17.12697   17.12697   17.12697   17.12697   17.12697     1
  Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534     1
  Cole3   23.63945   23.63945   23.63945   23.63945   23.63945   23.63945     1
  Cole4  877.92782  877.92782  877.92782  877.92782  877.92782  877.92782     1

从速度来看,AkselA 的方法似乎最快;但与其他所有方法相比,M-M 的方法得到了不同的结果(前提是把 AkselA 代码中 cut 的分界点 c(0, 19, 64, Inf) 里的 64 改为 65)。我会根据速度来接受答案,但还会继续研究这些结果之间的差异!

做法并不复杂:我先按 df$id 展开 monthbucket,再把两个表连接起来,然后计算年龄(由于你的分桶是整月,我只计算出生月份的第一天与 startmonth 之间的 difftime)。接着,对每个月(桶)统计各年龄组的人数,最后把长格式转换为宽格式,以便更清楚地展示结果。

library(lubridate)
library(tidyverse)

# Count members per age group (<19, 19-64, >64) for every monthly bucket.
#
# Steps: repeat each bucket row once per member id, attach the member's birth
# date, measure the age in years from the first day of the birth month to the
# bucket's startmonth, classify it, count per month, and reshape the counts to
# one column per age group.
monthbucket %>%
  group_by_all %>%
  expand(id = df$id) %>%
  left_join(
    .,
    {
      df %>%
        # floor_date() keeps birth_month a Date. cut(dob, "month") returns a
        # factor, which difftime() has to parse as a date-time in the session
        # time zone while the Date startmonth is taken as UTC; the resulting
        # offset of a few hours can move an age sitting on a group boundary
        # depending on the machine's time zone.
        mutate(birth_month = floor_date(dob, "month"))
    },
    by = "id"
  ) %>%
  mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
  # NOTE(review): `age` is fractional, so `age > 64` also captures members who
  # are 64 and some months old; the other answers cut at 65 -- confirm which
  # boundary is intended before comparing results.
  mutate(
    age_cat = case_when(
      age < 19 ~ "<19",
      age > 64 ~ ">64",
      TRUE ~ "19-64"
    )
  ) %>%
  group_by(month) %>%
  count(age_cat) %>%
  # Long -> wide: one column per age group, holding the count `n`.
  gather(variable, count, n) %>%
  unite(variable, age_cat) %>%
  spread(variable, count)

#> # A tibble: 13 x 4
#> # Groups:   month [13]
#>    month   `<19` `>64` `19-64`
#>    <fct>   <int> <int>   <int>
#>  1 2010-01     3     2       5
#>  2 2010-02     3     2       5
#>  3 2010-03     3     2       5
#>  4 2010-04     3     2       5
#>  5 2010-05     3     2       5
#>  6 2010-06     3     2       5
#>  7 2010-07     3     2       5
#>  8 2010-08     3     2       5
#>  9 2010-09     3     2       5
#> 10 2010-10     3     2       5
#> 11 2010-11     3     2       5
#> 12 2010-12     3     2       5
#> 13 2011-01     3     2       5

reprex package (v0.3.0)

于 2019-07-03 创建

假设我理解你的要求。

# Age in years (days / 365.25) of every member at the start of every monthly
# bucket: one row per member, one column per bucket.
ages <- as.data.frame(
  t(unclass(outer(monthbucket$startmonth, df$dob, "-") / 365.25))
)

# Classify each bucket column into the three requested groups.
# - The upper break is 65, not 64: a member aged 64 and some months is still
#   64 in whole years and belongs to "19-64"; only members who have completed
#   65 years are older than 64.
# - include.lowest = TRUE keeps an age of exactly 0 (born on the bucket's
#   start date) in the first group instead of turning it into NA.
ages <- do.call(
  data.frame,
  lapply(
    ages, cut,
    breaks = c(0, 19, 65, Inf),
    labels = c("0-19", "19-64", "64+"),
    include.lowest = TRUE
  )
)

# Tabulate each bucket column; because the factor levels are fixed, the result
# is a matrix with one row per age group and one column per month.
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
#       2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19        2       2       2       2       2       2       2       2       2       2       2       2       2
# 19-64       7       7       7       7       7       7       7       7       7       7       7       7       7
# 64+         1       1       1       1       1       1       1       1       1       1       1       1       1
# 

这个答案与 @AkselA 的回答有一些相似之处,因为它同样依赖 outer()、cut() 和 table():

# Self-contained data: ten random birth dates and thirteen monthly buckets.
set.seed(33)
birth_days <- sample(seq(as.Date("1940/01/01"), as.Date("2010/01/01"), by = "day"), 10)
month_bucket <- seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months")

# Age in years (days / 365.25) for every bucket/member pair; rows follow
# month_bucket, columns follow birth_days.
age_years <- outer(month_bucket, birth_days, `-`) / 365.25

# cut() is vectorised, so the whole matrix is classified in one call. Going
# through apply() would turn the factor into character and make an age group
# with no members vanish from the table; keeping the factor reports it as 0.
# NOTE: the intervals are right-closed, so an age of exactly 0 (born on the
# bucket date) falls outside (0,19] and is not counted. The labels are left
# unchanged so the table still matches the printed result.
age_group <- cut(as.numeric(age_years), c(0, 19, 65, Inf))

# as.numeric() flattens column by column, i.e. all buckets of the first member,
# then all buckets of the second, ... -- the same order as this label vector.
bucket_label <- rep(format(month_bucket, "%Y-%m"), length(birth_days))

# Rows: months; columns: age groups.
age_counts <- t(table(age_group, bucket_label, deparse.level = 0))
age_counts

          (0,19] (19,65] (65,Inf]
  2010-01      2       7        1
  2010-02      2       7        1
  2010-03      2       7        1
  2010-04      2       7        1
  2010-05      2       7        1
  2010-06      2       7        1
  2010-07      2       7        1
  2010-08      2       7        1
  2010-09      2       7        1
  2010-10      2       7        1
  2010-11      2       7        1
  2010-12      2       7        1
  2011-01      2       7        1

我觉得有这样一个类似的解决方案很奇怪所以这里是 data.table:

library(data.table)

# Count members per age group for every monthly bucket with data.table.
# Uses `month_bucket` (bucket start dates) and `birth_days` (birth dates)
# defined in the previous snippet.
dcast(
  # Cross join of buckets and birth dates. The columns are named explicitly
  # instead of relying on names derived from the argument expressions.
  CJ(month_bucket = month_bucket, birth_days = birth_days)[
    , .N
    , by = .(
      month_bucket,
      # Named grouping column, so the cast formula does not depend on the
      # automatically generated name. include.lowest = TRUE keeps an age of
      # exactly 0 (born on the bucket date) in the first group instead of NA.
      age_range = cut(
        as.numeric(month_bucket - birth_days) / 365.25,
        c(0, 19, 65, Inf),
        include.lowest = TRUE
      )
    )
  ],
  month_bucket ~ age_range,
  value.var = "N",
  # A bucket with nobody in some age group gets 0 rather than NA.
  fill = 0L
)

dplyr 和 tidyr:

library(dplyr)
library(tidyr)

# Count members per age group for every monthly bucket with dplyr/tidyr.
# Uses `month_bucket` (bucket start dates) and `birth_days` (birth dates)
# defined in an earlier snippet.
crossing(month_bucket, birth_days) %>%
  count(
    month_bucket,
    # Age in years (days / 365.25). include.lowest = TRUE keeps an age of
    # exactly 0 (born on the bucket date) in the first group instead of NA.
    age_range = cut(
      as.numeric(month_bucket - birth_days) / 365.25,
      c(0, 19, 65, Inf),
      include.lowest = TRUE
    )
  ) %>%
  # One column per age group; a bucket with nobody in a group gets 0, not NA.
  spread(age_range, n, fill = 0L)

我不太满意 base 中的类似方法。

# Base R version. Uses `month_bucket` (bucket start dates) and `birth_days`
# (birth dates) defined in an earlier snippet.

# Every bucket/member pair with the member's age in years (days / 365.25).
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25

# include.lowest = TRUE keeps an age of exactly 0 (born on the bucket date) in
# the first group instead of turning it into NA, which aggregate() would drop.
all_combos$cut_r <- cut(all_combos$age, c(0, 19, 65, Inf), include.lowest = TRUE)

# Long table: number of members per bucket and age group (count column `x`).
group_counts <- aggregate(
  all_combos$month_bucket,
  by = list(
    bucket = all_combos$month_bucket,
    age_group = all_combos$cut_r
  ),
  FUN = length
)

# Wide table: one row per bucket, one `x.<group>` column per age group.
wide_counts <- reshape(
  data = group_counts,
  timevar = "age_group",
  idvar = "bucket",
  direction = "wide"
)

# A bucket/group combination without members has no row in the long table and
# comes out of reshape() as NA; report it as a count of 0.
wide_counts[is.na(wide_counts)] <- 0L
wide_counts