展开开始日期和退出日期之间的行并计算 R 中的天数
Expand rows between begin date and exit date and count the number of days in R
我有一个数据集,其中每一行都是一个部长 ID,对于每一行我都有日期(日、月、年)的列,部长进入政府(开始)和退出政府(退出) .这是关于它现在的样子的示例:
data <- structure(list(id_min = c("1030015", "1030028"), begin_day = c("29",
"4"), begin_month = c("12", "1"), begin_year = c("2019", "2020"
), exit_day = c("3", "10"), exit_month = c("1", "1"), exit_year = c("2020",
"2020")), row.names = c(NA, -2L), class = c("data.frame"))
我想扩展 "begin date" 和 "exit date" 之间的行,并创建一个新列 (number_days) 来计算部长在政府任职的天数。数据集中的其他变量(此处省略)应在数据的扩展版本中重复)。这是我正在寻找的数据框输出:
我能够以 %Y-%m-%d 的格式将日期放在一起,并使用以下代码创建新列 "number_days"
data$begin_date <- as.Date(with(data, paste(begin_year, begin_month, begin_day,sep="-")), "%Y-%m-%d")
data$exit_date <- as.Date(with(data, paste(exit_year, exit_month, exit_day,sep="-")), "%Y-%m-%d")
data$number_days <- data$exit_date - data$begin_date
但是我没有成功扩展两个日期(begin_date 和 exit_date)之间的行。我试图使用 tidyr::complete()
.
来做到这一点
一种方法是使用 uncount
函数:
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(start_date = mdy(paste(begin_month,begin_day,begin_year,sep="-")),
end_date = mdy(paste(exit_month,exit_day,exit_year,sep="-")),
number_days = as.integer(end_date-start_date + 1)) %>%
uncount(as.integer(number_days)) %>%
group_by(id_min) %>%
mutate(begin_day = day(seq(start_date[1], end_date[1], by = "days")),
begin_month = month(seq(start_date[1], end_date[1], by = "days")),
begin_year = year(seq(start_date[1], end_date[1], by = "days"))) %>%
dplyr::select(-start_date, -end_date)
# A tibble: 13 x 8
# Groups: id_min [2]
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <int> <int> <int> <chr> <chr> <chr> <int>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 1 1 2020 3 1 2020 6
5 1030015 2 1 2020 3 1 2020 6
6 1030015 3 1 2020 3 1 2020 6
7 1030028 4 1 2020 10 1 2020 7
8 1030028 5 1 2020 10 1 2020 7
9 1030028 6 1 2020 10 1 2020 7
10 1030028 7 1 2020 10 1 2020 7
11 1030028 8 1 2020 10 1 2020 7
12 1030028 9 1 2020 10 1 2020 7
13 1030028 10 1 2020 10 1 2020 7
您也可以使用 tsibble
包中的 fill_gaps()
函数。
library(tsibble)
library(dplyr)
library(tidyr)
data %>%
mutate(
exit_date = as.Date(paste(exit_year, exit_month, exit_day, sep = "-")),
begin_date = as.Date(paste(begin_year, begin_month, begin_day, sep = "-")),
number_days = as.numeric(exit_date - begin_date) + 1
) %>%
pivot_longer(cols = ends_with("date"), names_to = "event",values_to = "date") %>%
as_tsibble(key = id_min, index = date) %>%
fill_gaps() %>%
mutate(begin_day = format(date, "%d"),
begin_month = format(date, "%m"),
begin_year = format(date, "%Y")) %>%
as_tibble() %>%
select(-event, -date) %>%
fill(everything())
# A tibble: 13 x 8
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 01 01 2020 3 1 2020 6
5 1030015 02 01 2020 3 1 2020 6
6 1030015 03 01 2020 3 1 2020 6
7 1030028 04 01 2020 10 1 2020 7
8 1030028 05 01 2020 10 1 2020 7
9 1030028 06 01 2020 10 1 2020 7
10 1030028 07 01 2020 10 1 2020 7
11 1030028 08 01 2020 10 1 2020 7
12 1030028 09 01 2020 10 1 2020 7
13 1030028 10 01 2020 10 1 2020 7
一个tidyverse
解决方案:
library(tidyverse)
df %>%
#Combine begin columns to have start_day
unite(start_day, begin_year, begin_month, begin_day, sep = '-', remove = FALSE) %>%
#Combine exit columns to have end_day
unite(end_day, exit_year, exit_month, exit_day, sep = '-', remove = FALSE) %>%
#Convert to date
mutate_at(vars(start_day, end_day), as.Date) %>%
#Count number of days between them
mutate(number_days = as.integer(end_day - start_day) + 1) %>%
#Create a sequence between two dates
mutate(date = map2(start_day, end_day, seq, by = 'day')) %>%
#Add the data in long format
unnest(date) %>%
#separate date into year, month and date.
separate(date, c('begin_year', 'begin_month', 'begin_day'), sep = '-') %>%
select(-start_day, -end_day)
# A tibble: 13 x 8
# id_min exit_day exit_month exit_year number_days begin_year begin_month begin_day
# <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
# 1 1030015 3 1 2020 6 2019 12 29
# 2 1030015 3 1 2020 6 2019 12 30
# 3 1030015 3 1 2020 6 2019 12 31
# 4 1030015 3 1 2020 6 2020 01 01
# 5 1030015 3 1 2020 6 2020 01 02
# 6 1030015 3 1 2020 6 2020 01 03
# 7 1030028 10 1 2020 7 2020 01 04
# 8 1030028 10 1 2020 7 2020 01 05
# 9 1030028 10 1 2020 7 2020 01 06
#10 1030028 10 1 2020 7 2020 01 07
#11 1030028 10 1 2020 7 2020 01 08
#12 1030028 10 1 2020 7 2020 01 09
#13 1030028 10 1 2020 7 2020 01 10
我有一个数据集,其中每一行都是一个部长 ID,对于每一行我都有日期(日、月、年)的列,部长进入政府(开始)和退出政府(退出) .这是关于它现在的样子的示例:
data <- structure(list(id_min = c("1030015", "1030028"), begin_day = c("29",
"4"), begin_month = c("12", "1"), begin_year = c("2019", "2020"
), exit_day = c("3", "10"), exit_month = c("1", "1"), exit_year = c("2020",
"2020")), row.names = c(NA, -2L), class = c("data.frame"))
我想扩展 "begin date" 和 "exit date" 之间的行,并创建一个新列 (number_days) 来计算部长在政府任职的天数。数据集中的其他变量(此处省略)应在数据的扩展版本中重复)。这是我正在寻找的数据框输出:
我能够以 %Y-%m-%d 的格式将日期放在一起,并使用以下代码创建新列 "number_days"
data$begin_date <- as.Date(with(data, paste(begin_year, begin_month, begin_day,sep="-")), "%Y-%m-%d")
data$exit_date <- as.Date(with(data, paste(exit_year, exit_month, exit_day,sep="-")), "%Y-%m-%d")
data$number_days <- data$exit_date - data$begin_date
但是我没有成功扩展两个日期(begin_date 和 exit_date)之间的行。我试图使用 tidyr::complete()
.
一种方法是使用 uncount
函数:
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(start_date = mdy(paste(begin_month,begin_day,begin_year,sep="-")),
end_date = mdy(paste(exit_month,exit_day,exit_year,sep="-")),
number_days = as.integer(end_date-start_date + 1)) %>%
uncount(as.integer(number_days)) %>%
group_by(id_min) %>%
mutate(begin_day = day(seq(start_date[1], end_date[1], by = "days")),
begin_month = month(seq(start_date[1], end_date[1], by = "days")),
begin_year = year(seq(start_date[1], end_date[1], by = "days"))) %>%
dplyr::select(-start_date, -end_date)
# A tibble: 13 x 8
# Groups: id_min [2]
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <int> <int> <int> <chr> <chr> <chr> <int>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 1 1 2020 3 1 2020 6
5 1030015 2 1 2020 3 1 2020 6
6 1030015 3 1 2020 3 1 2020 6
7 1030028 4 1 2020 10 1 2020 7
8 1030028 5 1 2020 10 1 2020 7
9 1030028 6 1 2020 10 1 2020 7
10 1030028 7 1 2020 10 1 2020 7
11 1030028 8 1 2020 10 1 2020 7
12 1030028 9 1 2020 10 1 2020 7
13 1030028 10 1 2020 10 1 2020 7
您也可以使用 tsibble
包中的 fill_gaps()
函数。
library(tsibble)
library(dplyr)
library(tidyr)
data %>%
mutate(
exit_date = as.Date(paste(exit_year, exit_month, exit_day, sep = "-")),
begin_date = as.Date(paste(begin_year, begin_month, begin_day, sep = "-")),
number_days = as.numeric(exit_date - begin_date) + 1
) %>%
pivot_longer(cols = ends_with("date"), names_to = "event",values_to = "date") %>%
as_tsibble(key = id_min, index = date) %>%
fill_gaps() %>%
mutate(begin_day = format(date, "%d"),
begin_month = format(date, "%m"),
begin_year = format(date, "%Y")) %>%
as_tibble() %>%
select(-event, -date) %>%
fill(everything())
# A tibble: 13 x 8
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 01 01 2020 3 1 2020 6
5 1030015 02 01 2020 3 1 2020 6
6 1030015 03 01 2020 3 1 2020 6
7 1030028 04 01 2020 10 1 2020 7
8 1030028 05 01 2020 10 1 2020 7
9 1030028 06 01 2020 10 1 2020 7
10 1030028 07 01 2020 10 1 2020 7
11 1030028 08 01 2020 10 1 2020 7
12 1030028 09 01 2020 10 1 2020 7
13 1030028 10 01 2020 10 1 2020 7
一个tidyverse
解决方案:
library(tidyverse)
df %>%
#Combine begin columns to have start_day
unite(start_day, begin_year, begin_month, begin_day, sep = '-', remove = FALSE) %>%
#Combine exit columns to have end_day
unite(end_day, exit_year, exit_month, exit_day, sep = '-', remove = FALSE) %>%
#Convert to date
mutate_at(vars(start_day, end_day), as.Date) %>%
#Count number of days between them
mutate(number_days = as.integer(end_day - start_day) + 1) %>%
#Create a sequence between two dates
mutate(date = map2(start_day, end_day, seq, by = 'day')) %>%
#Add the data in long format
unnest(date) %>%
#separate date into year, month and date.
separate(date, c('begin_year', 'begin_month', 'begin_day'), sep = '-') %>%
select(-start_day, -end_day)
# A tibble: 13 x 8
# id_min exit_day exit_month exit_year number_days begin_year begin_month begin_day
# <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
# 1 1030015 3 1 2020 6 2019 12 29
# 2 1030015 3 1 2020 6 2019 12 30
# 3 1030015 3 1 2020 6 2019 12 31
# 4 1030015 3 1 2020 6 2020 01 01
# 5 1030015 3 1 2020 6 2020 01 02
# 6 1030015 3 1 2020 6 2020 01 03
# 7 1030028 10 1 2020 7 2020 01 04
# 8 1030028 10 1 2020 7 2020 01 05
# 9 1030028 10 1 2020 7 2020 01 06
#10 1030028 10 1 2020 7 2020 01 07
#11 1030028 10 1 2020 7 2020 01 08
#12 1030028 10 1 2020 7 2020 01 09
#13 1030028 10 1 2020 7 2020 01 10