在 R 中的 lubridate 中处理大型数据集中日期的计算有效方法

Computationally efficient ways for working with dates in large datasets in lubridate in R

我有这样的数据,但有 2000 万行。

library(tidyr)
library(dplyr)
library(stringr)
library(magrittr)
library(lubridate)
library(tidyverse)


df <- data.frame(
  DATE_OF_BIRTH = c("1933-03-31", "1947-06-25", "1901-09-02", "1952-01-22", "1936-07-18", "2020-10-22", "1930-05-18", "1926-05-13"),
  DATE_OF_DEATH = c(NA, "2019-02-04", "2017-10-27", NA, "2021-01-03", NA, NA, NA),
) 

我想做的是

A) 计算截至 2019 年 12 月 31 日人们的年龄;并将他们按年龄分组

B) 删除年龄或死亡日期不可能的人

这是我的代码 运行 这样做

#Change the missing dates of death into a format recognisable as a date, which is far into the future
df %<>%
  replace_na(list(DATE_OF_DEATH = "01/01/9999"))

#Specify the start and end date of the year of interest
end_yr_date = dmy('31/12/2019')
start_yr_date = dmy('01/01/2019')

df %<>%
  #create age
  mutate(age = floor(interval(start = dmy(DATE_OF_BIRTH), end = end_yr_date) / 
                       duration(num = 1, units = "years"))) %>%
  #and age groupings
  mutate(age_group = cut(age, 
                         breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 150), 
                         labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44", 
                                    "45-49", "50-54","55-59", "60-64", "65-69",
                                    "70-74", "75-79", "80-84", "85+"), right = FALSE))


df %<>%
  #remove people who were born after end date
  filter(!(dmy(DATE_OF_BIRTH) > end_yr_date)) %>%
  #remove people who died before start date
  filter(!(dmy(DATE_OF_DEATH) < start_yr_date)) %>%
  #Remove people with a negative age
  filter(age >= 0) %>%
  #Remove people older than 115
  filter(age < 116)

这在此示例数据集上运行良好,但它仅在 2000 万行数据上保留 运行 和 运行 以及 运行。我想知道是否有处理计算效率更高、速度更快的日期的方法?

我也想知道我是否有无法解析的日期格式(我已经删除了 NA 日期,但可能还有其他格式不正确的数据输入错误)这就是代码的原因只是保持 运行。有谁知道一种有效的方法来确定任何不会解析的日期格式(不是 NA)?

感谢您的帮助。

您可以将列更改为 class 一次并将所有 filter 表达式包含在一个列中。

library(dplyr)
library(lubridate)

df %>%
  mutate(across(c(DATE_OF_BIRTH, DATE_OF_DEATH), ymd),
         age = floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) / 
                       duration(num = 1, units = "years")), 
         age_group = cut(age, 
                         breaks = c(seq(0, 85, 5), 150), 
                         labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44", 
                                    "45-49", "50-54","55-59", "60-64", "65-69",
                                    "70-74", "75-79", "80-84", "85+"), right = FALSE)) %>%
  filter(DATE_OF_BIRTH < end_yr_date, DATE_OF_DEATH > start_yr_date, 
         between(age, 0, 116)) -> result

如果仍然很慢,您可以切换到 data.table

library(data.table)

setDT(df)
df[, c('DATE_OF_BIRTH', 'DATE_OF_DEATH') := lapply(.SD, ymd), .SDcols = c('DATE_OF_BIRTH', 'DATE_OF_DEATH')] %>%
  .[, age := floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) / 
                duration(num = 1, units = "years"))] %>%
  .[, age_group := cut(age, 
                breaks = c(seq(0, 85, 5), 150), 
                labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44", 
                           "45-49", "50-54","55-59", "60-64", "65-69",
                           "70-74", "75-79", "80-84", "85+"), right = FALSE)] %>%
  .[DATE_OF_BIRTH < end_yr_date & DATE_OF_DEATH > start_yr_date & between(age, 0, 116)] -> result