在 R 中展开开始日期与退出日期之间的行并计算天数

Expand rows between begin date and exit date and count the number of days in R

我有一个数据集,其中每一行对应一个部长 ID。每一行都有若干日期列(日、月、年),分别记录该部长进入政府(begin)和退出政府(exit)的日期。下面是当前数据的示例:

# Sample data: one row per minister ID, with the begin and exit dates stored
# as separate character columns for day, month and year.
data <- data.frame(
  id_min = c("1030015", "1030028"),
  begin_day = c("29", "4"),
  begin_month = c("12", "1"),
  begin_year = c("2019", "2020"),
  exit_day = c("3", "10"),
  exit_month = c("1", "1"),
  exit_year = c("2020", "2020"),
  stringsAsFactors = FALSE
)

我想展开 "begin date" 和 "exit date" 之间的行(即部长在任的每一天各占一行),并创建一个新列 (number_days) 来计算部长在政府任职的天数。数据集中的其他变量(此处省略)应在展开后的数据中重复出现。这是我期望得到的数据框输出:

我能够以 %Y-%m-%d 的格式把日期拼接起来,并使用以下代码创建新列 "number_days":

# Build "year-month-day" strings from the component columns and parse them
# into Date values.
data$begin_date <- as.Date(
  paste(data$begin_year, data$begin_month, data$begin_day, sep = "-"),
  format = "%Y-%m-%d"
)
data$exit_date <- as.Date(
  paste(data$exit_year, data$exit_month, data$exit_day, sep = "-"),
  format = "%Y-%m-%d"
)
# Difference between exit and begin date, stored as a difftime in days.
data$number_days <- difftime(data$exit_date, data$begin_date, units = "days")

但是我没能成功展开两个日期(begin_date 和 exit_date)之间的行。我曾尝试使用 tidyr::complete() 来做到这一点。

一种方法是使用 uncount 函数:

library(dplyr)
library(tidyr)
library(lubridate)
# Expand every row into one row per calendar day between its begin and exit
# dates (both inclusive). The expansion works row by row, so an id_min that
# appears in more than one row is expanded separately for each of its rows.
data %>%
  mutate(
    start_date = mdy(paste(begin_month, begin_day, begin_year, sep = "-")),
    end_date = mdy(paste(exit_month, exit_day, exit_year, sep = "-")),
    # +1 so that both the first and the last day are counted.
    number_days = as.integer(end_date - start_date) + 1L
  ) %>%
  # .remove = FALSE keeps number_days in the result; .id numbers the copies of
  # each original row 1, 2, ..., number_days.
  uncount(number_days, .remove = FALSE, .id = "day_offset") %>%
  mutate(
    # Calendar date represented by each expanded row.
    current_date = start_date + (day_offset - 1L),
    begin_day = day(current_date),
    begin_month = month(current_date),
    begin_year = year(current_date)
  ) %>%
  # Drop the helper columns; dplyr:: guards against select() being masked.
  dplyr::select(-start_date, -end_date, -day_offset, -current_date) %>%
  as_tibble()
# A tibble: 13 x 8
   id_min  begin_day begin_month begin_year exit_day exit_month exit_year number_days
   <chr>       <int>       <int>      <int> <chr>    <chr>      <chr>           <int>
 1 1030015        29          12       2019 3        1          2020                6
 2 1030015        30          12       2019 3        1          2020                6
 3 1030015        31          12       2019 3        1          2020                6
 4 1030015         1           1       2020 3        1          2020                6
 5 1030015         2           1       2020 3        1          2020                6
 6 1030015         3           1       2020 3        1          2020                6
 7 1030028         4           1       2020 10       1          2020                7
 8 1030028         5           1       2020 10       1          2020                7
 9 1030028         6           1       2020 10       1          2020                7
10 1030028         7           1       2020 10       1          2020                7
11 1030028         8           1       2020 10       1          2020                7
12 1030028         9           1       2020 10       1          2020                7
13 1030028        10           1       2020 10       1          2020                7

您也可以使用 tsibble 包中的 fill_gaps() 函数。

library(tsibble)
library(dplyr)
library(tidyr)

# Expand each row into one row per day by turning the begin/exit dates into a
# tsibble and letting fill_gaps() insert the dates in between.
data %>%
  mutate(
    exit_date = as.Date(paste(exit_year, exit_month, exit_day, sep = "-")),
    begin_date = as.Date(paste(begin_year, begin_month, begin_day, sep = "-")),
    # +1 so that both the first and the last day are counted.
    number_days = as.numeric(exit_date - begin_date) + 1
  ) %>%
  # One row per endpoint (begin/exit), with that endpoint's date in `date`.
  pivot_longer(
    cols = ends_with("date"),
    names_to = "event",
    values_to = "date"
  ) %>%
  # A term that begins and ends on the same day yields two rows with the same
  # id_min and date, which as_tsibble() rejects; keep only one of them.
  distinct(id_min, date, .keep_all = TRUE) %>%
  as_tsibble(key = id_min, index = date) %>%
  # Insert the missing dates between the endpoints; the other columns of the
  # inserted rows are NA until they are filled below.
  fill_gaps() %>%
  mutate(
    begin_day = format(date, "%d"),
    begin_month = format(date, "%m"),
    begin_year = format(date, "%Y")
  ) %>%
  as_tibble() %>%
  select(-event, -date) %>%
  # Carry the remaining columns down into the inserted rows.
  fill(everything())

# A tibble: 13 x 8
   id_min  begin_day begin_month begin_year exit_day exit_month exit_year number_days
   <chr>   <chr>     <chr>       <chr>      <chr>    <chr>      <chr>           <dbl>
 1 1030015 29        12          2019       3        1          2020                6
 2 1030015 30        12          2019       3        1          2020                6
 3 1030015 31        12          2019       3        1          2020                6
 4 1030015 01        01          2020       3        1          2020                6
 5 1030015 02        01          2020       3        1          2020                6
 6 1030015 03        01          2020       3        1          2020                6
 7 1030028 04        01          2020       10       1          2020                7
 8 1030028 05        01          2020       10       1          2020                7
 9 1030028 06        01          2020       10       1          2020                7
10 1030028 07        01          2020       10       1          2020                7
11 1030028 08        01          2020       10       1          2020                7
12 1030028 09        01          2020       10       1          2020                7
13 1030028 10        01          2020       10       1          2020                7

一个 tidyverse 解决方案:

library(tidyverse)

# Expand each row of `data` into one row per day between the begin and exit
# dates (both inclusive), using a list-column of date sequences.
data %>%
  # Combine the begin columns into start_day (original columns are kept).
  unite(
    start_day, begin_year, begin_month, begin_day,
    sep = "-", remove = FALSE
  ) %>%
  # Combine the exit columns into end_day (original columns are kept).
  unite(
    end_day, exit_year, exit_month, exit_day,
    sep = "-", remove = FALSE
  ) %>%
  # Convert both combined strings to Date.
  mutate(across(c(start_day, end_day), as.Date)) %>%
  # Count the days between them; +1 so that both endpoints are counted.
  mutate(number_days = as.integer(end_day - start_day) + 1) %>%
  # Create the sequence of dates between the two endpoints for every row.
  mutate(date = map2(start_day, end_day, seq, by = "day")) %>%
  # Turn the list-column into long format: one row per date.
  unnest(date) %>%
  # Split each date back into year, month and day.
  separate(date, c("begin_year", "begin_month", "begin_day"), sep = "-") %>%
  select(-start_day, -end_day)


# A tibble: 13 x 8
#   id_min  exit_day exit_month exit_year number_days begin_year begin_month begin_day
#   <chr>   <chr>    <chr>      <chr>           <dbl> <chr>      <chr>       <chr>    
# 1 1030015 3        1          2020                6 2019       12          29       
# 2 1030015 3        1          2020                6 2019       12          30       
# 3 1030015 3        1          2020                6 2019       12          31       
# 4 1030015 3        1          2020                6 2020       01          01       
# 5 1030015 3        1          2020                6 2020       01          02       
# 6 1030015 3        1          2020                6 2020       01          03       
# 7 1030028 10       1          2020                7 2020       01          04       
# 8 1030028 10       1          2020                7 2020       01          05       
# 9 1030028 10       1          2020                7 2020       01          06       
#10 1030028 10       1          2020                7 2020       01          07       
#11 1030028 10       1          2020                7 2020       01          08       
#12 1030028 10       1          2020                7 2020       01          09       
#13 1030028 10       1          2020                7 2020       01          10