在 R 中的 lubridate 中处理大型数据集中日期的计算有效方法
Computationally efficient ways for working with dates in large datasets in lubridate in R
我有这样的数据,但有 2000 万行。
library(tidyr)
library(dplyr)
library(stringr)
library(magrittr)
library(lubridate)
library(tidyverse)
df <- data.frame(
DATE_OF_BIRTH = c("1933-03-31", "1947-06-25", "1901-09-02", "1952-01-22", "1936-07-18", "2020-10-22", "1930-05-18", "1926-05-13"),
DATE_OF_DEATH = c(NA, "2019-02-04", "2017-10-27", NA, "2021-01-03", NA, NA, NA),
)
我想做的是
A) 计算截至 2019 年 12 月 31 日人们的年龄;并将他们按年龄分组
B) 删除年龄或死亡日期不可能的人
这是我的代码 运行 这样做
#Change the missing dates of death into a format recognisable as a date, which is far into the future
df %<>%
replace_na(list(DATE_OF_DEATH = "01/01/9999"))
#Specify the start and end date of the year of interest
end_yr_date = dmy('31/12/2019')
start_yr_date = dmy('01/01/2019')
df %<>%
#create age
mutate(age = floor(interval(start = dmy(DATE_OF_BIRTH), end = end_yr_date) /
duration(num = 1, units = "years"))) %>%
#and age groupings
mutate(age_group = cut(age,
breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE))
df %<>%
#remove people who were born after end date
filter(!(dmy(DATE_OF_BIRTH) > end_yr_date)) %>%
#remove people who died before start date
filter(!(dmy(DATE_OF_DEATH) < start_yr_date)) %>%
#Remove people with a negative age
filter(age >= 0) %>%
#Remove people older than 115
filter(age < 116)
这在此示例数据集上运行良好,但它仅在 2000 万行数据上保留 运行 和 运行 以及 运行。我想知道是否有处理计算效率更高、速度更快的日期的方法?
我也想知道我是否有无法解析的日期格式(我已经删除了 NA 日期,但可能还有其他格式不正确的数据输入错误)这就是代码的原因只是保持 运行。有谁知道一种有效的方法来确定任何不会解析的日期格式(不是 NA)?
感谢您的帮助。
您可以将列更改为 class 一次并将所有 filter
表达式包含在一个列中。
library(dplyr)
library(lubridate)
df %>%
mutate(across(c(DATE_OF_BIRTH, DATE_OF_DEATH), ymd),
age = floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) /
duration(num = 1, units = "years")),
age_group = cut(age,
breaks = c(seq(0, 85, 5), 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE)) %>%
filter(DATE_OF_BIRTH < end_yr_date, DATE_OF_DEATH > start_yr_date,
between(age, 0, 116)) -> result
如果仍然很慢,您可以切换到 data.table
。
library(data.table)
setDT(df)
df[, c('DATE_OF_BIRTH', 'DATE_OF_DEATH') := lapply(.SD, ymd), .SDcols = c('DATE_OF_BIRTH', 'DATE_OF_DEATH')] %>%
.[, age := floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) /
duration(num = 1, units = "years"))] %>%
.[, age_group := cut(age,
breaks = c(seq(0, 85, 5), 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE)] %>%
.[DATE_OF_BIRTH < end_yr_date & DATE_OF_DEATH > start_yr_date & between(age, 0, 116)] -> result
我有这样的数据,但有 2000 万行。
library(tidyr)
library(dplyr)
library(stringr)
library(magrittr)
library(lubridate)
library(tidyverse)
df <- data.frame(
DATE_OF_BIRTH = c("1933-03-31", "1947-06-25", "1901-09-02", "1952-01-22", "1936-07-18", "2020-10-22", "1930-05-18", "1926-05-13"),
DATE_OF_DEATH = c(NA, "2019-02-04", "2017-10-27", NA, "2021-01-03", NA, NA, NA),
)
我想做的是
A) 计算截至 2019 年 12 月 31 日人们的年龄;并将他们按年龄分组
B) 删除年龄或死亡日期不可能的人
这是我的代码 运行 这样做
#Change the missing dates of death into a format recognisable as a date, which is far into the future
df %<>%
replace_na(list(DATE_OF_DEATH = "01/01/9999"))
#Specify the start and end date of the year of interest
end_yr_date = dmy('31/12/2019')
start_yr_date = dmy('01/01/2019')
df %<>%
#create age
mutate(age = floor(interval(start = dmy(DATE_OF_BIRTH), end = end_yr_date) /
duration(num = 1, units = "years"))) %>%
#and age groupings
mutate(age_group = cut(age,
breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE))
df %<>%
#remove people who were born after end date
filter(!(dmy(DATE_OF_BIRTH) > end_yr_date)) %>%
#remove people who died before start date
filter(!(dmy(DATE_OF_DEATH) < start_yr_date)) %>%
#Remove people with a negative age
filter(age >= 0) %>%
#Remove people older than 115
filter(age < 116)
这在此示例数据集上运行良好,但它仅在 2000 万行数据上保留 运行 和 运行 以及 运行。我想知道是否有处理计算效率更高、速度更快的日期的方法?
我也想知道我是否有无法解析的日期格式(我已经删除了 NA 日期,但可能还有其他格式不正确的数据输入错误)这就是代码的原因只是保持 运行。有谁知道一种有效的方法来确定任何不会解析的日期格式(不是 NA)?
感谢您的帮助。
您可以将列更改为 class 一次并将所有 filter
表达式包含在一个列中。
library(dplyr)
library(lubridate)
df %>%
mutate(across(c(DATE_OF_BIRTH, DATE_OF_DEATH), ymd),
age = floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) /
duration(num = 1, units = "years")),
age_group = cut(age,
breaks = c(seq(0, 85, 5), 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE)) %>%
filter(DATE_OF_BIRTH < end_yr_date, DATE_OF_DEATH > start_yr_date,
between(age, 0, 116)) -> result
如果仍然很慢,您可以切换到 data.table
。
library(data.table)
setDT(df)
df[, c('DATE_OF_BIRTH', 'DATE_OF_DEATH') := lapply(.SD, ymd), .SDcols = c('DATE_OF_BIRTH', 'DATE_OF_DEATH')] %>%
.[, age := floor(interval(start = DATE_OF_BIRTH, end = end_yr_date) /
duration(num = 1, units = "years"))] %>%
.[, age_group := cut(age,
breaks = c(seq(0, 85, 5), 150),
labels = c("00-04",'05-09','10-14',"15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54","55-59", "60-64", "65-69",
"70-74", "75-79", "80-84", "85+"), right = FALSE)] %>%
.[DATE_OF_BIRTH < end_yr_date & DATE_OF_DEATH > start_yr_date & between(age, 0, 116)] -> result