创建新 data.frame 考虑期间的虚拟变量和第三个变量
Creating new data.frame considering a dummy variable for period and a third variable
使用此数据(*更正)
# Sample input data: a 46-row data.frame with three columns.
#   Date     - numeric day counts carrying class "Date", one value per row
#              (1461 is 1974-01-01 and 17897 is 2019-01-01 when read as days
#              since 1970-01-01).
#   State    - numeric 0/1 dummy variable.
#   Duration - character; "" on most rows, a number stored as a string on others.
# NOTE(review): the value is not assigned to a name here; presumably it is the
# `df` that the pipeline further down operates on -- confirm.
structure(list(Date = structure(c(1461, 1826, 2191, 2557, 2922,
3287, 3652, 4018, 4383, 4748, 5113, 5479, 5844, 6209, 6574, 6940,
7305, 7670, 8035, 8401, 8766, 9131, 9496, 9862, 10227, 10592,
10957, 11323, 11688, 12053, 12418, 12784, 13149, 13514, 13879,
14245, 14610, 14975, 15340, 15706, 16071, 16436, 16801, 17167,
17532, 17897), class = "Date"), State = c(1, 1, 1, 1, 1, 0, 0,
1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), Duration = c("",
"", "", "", "5", "", "2", "1", "", "", "3", "", "2", "", "2",
"1", "1", "1", "", "", "3", "", "", "", "4", "", "", "", "",
"", "", "", "", "", "10", "1", "", "2", "", "", "", "", "", "",
"", "8")), row.names = c(NA, -46L), class = "data.frame")
我想达到这个新 data.frame:
# Desired result: 15 rows, one per period, with the State label alternating
# between "Expansion" and "Contraction" (starting with "Expansion"), the
# Duration as a string, and the Period as a single year or a "first - last"
# year range.
newdf <- data.frame(
  State = rep(c("Expansion", "Contraction"), length.out = 15),
  Duration = c(
    "5", "2", "1", "3", "2",
    "2", "1", "1", "1", "3",
    "4", "10", "1", "2", "8"
  ),
  Period = c(
    "1974 - 1978", "1979 - 1980", "1981",
    "1982 - 1984", "1985 - 1986", "1987 - 1988",
    "1989", "1990", "1991",
    "1992 - 1994", "1995 - 1998", "1999 - 2008",
    "2009", "2010 - 2011", "2012 - 2019"
  )
)
虚拟变量 State 等于 1 时应返回 "Expansion"(扩张),等于 0 时应返回 "Contraction"(收缩)。我认为可以利用 Duration 列来确定每个时期的起始年份和结束年份。
这里有一个 tidyverse 的方案。先将 'Date' 转换为 Date 类,并从 'Date' 中提取 year;然后根据 'Duration' 中出现的数值创建分组列 'grp'。按 'grp' 分组后,用 summarise 取 'Duration' 的 last 值;如果该 'grp' 中有多行,则把 first 和 last 的 'year' 用 paste 连接起来,否则只返回 first 的 'year'。类似地,如果组内 all 'State' 都是 1,则把 'State' 改为单值 'Expansion',否则改为 'Contraction'。
library(dplyr)
library(lubridate)
library(stringr)
# Collapse the rows of `df` into one row per period.
#
# A row whose Duration contains digits closes a period, so the running count
# of such rows, shifted down by one row with lag(), gives a group id that
# changes on the row *after* each marker. Each group is then summarised to:
#   Duration - the last Duration value in the group (the marker row's value),
#   Period   - "first - last" year for multi-row groups, else the single year,
#   State    - "Expansion" when every State in the group is non-zero,
#              otherwise "Contraction".
df %>%
  mutate(Date = as.Date(Date),
         year = year(Date),
         # The backslash must be doubled inside an R string literal; a bare
         # "\d" is rejected by the parser as an unrecognized escape.
         grp = lag(cumsum(str_detect(Duration, "\\d+")), default = 0)) %>%
  group_by(grp) %>%
  summarise(Duration = last(Duration),
            Period = if (n() > 1) {
              str_c(first(year), last(year), sep = " - ")
            } else {
              as.character(first(year))
            },
            State = if (all(as.logical(State))) {
              "Expansion"
            } else {
              "Contraction"
            }) %>%
  select(State, Duration, Period)
# A tibble: 15 x 3
# State Duration Period
# <chr> <chr> <chr>
# 1 Expansion 5 1974 - 1978
# 2 Contraction 2 1979 - 1980
# 3 Expansion 1 1981
# 4 Contraction 3 1982 - 1984
# 5 Expansion 2 1985 - 1986
# 6 Contraction 2 1987 - 1988
# 7 Expansion 1 1989
# 8 Contraction 1 1990
# 9 Expansion 1 1991
#10 Contraction 3 1992 - 1994
#11 Expansion 4 1995 - 1998
#12 Contraction 10 1999 - 2008
#13 Expansion 1 2009
#14 Contraction 2 2010 - 2011
#15 Expansion 8 2012 - 2019
使用此数据(*更正)
# Sample input data: a 46-row data.frame with three columns.
#   Date     - numeric day counts carrying class "Date", one value per row
#              (1461 is 1974-01-01 and 17897 is 2019-01-01 when read as days
#              since 1970-01-01).
#   State    - numeric 0/1 dummy variable.
#   Duration - character; "" on most rows, a number stored as a string on others.
# NOTE(review): the value is not assigned to a name here; presumably it is the
# `df` that the pipeline further down operates on -- confirm.
structure(list(Date = structure(c(1461, 1826, 2191, 2557, 2922,
3287, 3652, 4018, 4383, 4748, 5113, 5479, 5844, 6209, 6574, 6940,
7305, 7670, 8035, 8401, 8766, 9131, 9496, 9862, 10227, 10592,
10957, 11323, 11688, 12053, 12418, 12784, 13149, 13514, 13879,
14245, 14610, 14975, 15340, 15706, 16071, 16436, 16801, 17167,
17532, 17897), class = "Date"), State = c(1, 1, 1, 1, 1, 0, 0,
1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), Duration = c("",
"", "", "", "5", "", "2", "1", "", "", "3", "", "2", "", "2",
"1", "1", "1", "", "", "3", "", "", "", "4", "", "", "", "",
"", "", "", "", "", "10", "1", "", "2", "", "", "", "", "", "",
"", "8")), row.names = c(NA, -46L), class = "data.frame")
我想达到这个新 data.frame:
# Desired result: 15 rows, one per period, with the State label alternating
# between "Expansion" and "Contraction" (starting with "Expansion"), the
# Duration as a string, and the Period as a single year or a "first - last"
# year range.
newdf <- data.frame(
  State = rep(c("Expansion", "Contraction"), length.out = 15),
  Duration = c(
    "5", "2", "1", "3", "2",
    "2", "1", "1", "1", "3",
    "4", "10", "1", "2", "8"
  ),
  Period = c(
    "1974 - 1978", "1979 - 1980", "1981",
    "1982 - 1984", "1985 - 1986", "1987 - 1988",
    "1989", "1990", "1991",
    "1992 - 1994", "1995 - 1998", "1999 - 2008",
    "2009", "2010 - 2011", "2012 - 2019"
  )
)
虚拟变量 State 等于 1 时应返回 "Expansion"(扩张),等于 0 时应返回 "Contraction"(收缩)。我认为可以利用 Duration 列来确定每个时期的起始年份和结束年份。
这里有一个 tidyverse 的方案。先将 'Date' 转换为 Date 类,并从 'Date' 中提取 year;然后根据 'Duration' 中出现的数值创建分组列 'grp'。按 'grp' 分组后,用 summarise 取 'Duration' 的 last 值;如果该 'grp' 中有多行,则把 first 和 last 的 'year' 用 paste 连接起来,否则只返回 first 的 'year'。类似地,如果组内 all 'State' 都是 1,则把 'State' 改为单值 'Expansion',否则改为 'Contraction'。
library(dplyr)
library(lubridate)
library(stringr)
# Collapse the rows of `df` into one row per period.
#
# A row whose Duration contains digits closes a period, so the running count
# of such rows, shifted down by one row with lag(), gives a group id that
# changes on the row *after* each marker. Each group is then summarised to:
#   Duration - the last Duration value in the group (the marker row's value),
#   Period   - "first - last" year for multi-row groups, else the single year,
#   State    - "Expansion" when every State in the group is non-zero,
#              otherwise "Contraction".
df %>%
  mutate(Date = as.Date(Date),
         year = year(Date),
         # The backslash must be doubled inside an R string literal; a bare
         # "\d" is rejected by the parser as an unrecognized escape.
         grp = lag(cumsum(str_detect(Duration, "\\d+")), default = 0)) %>%
  group_by(grp) %>%
  summarise(Duration = last(Duration),
            Period = if (n() > 1) {
              str_c(first(year), last(year), sep = " - ")
            } else {
              as.character(first(year))
            },
            State = if (all(as.logical(State))) {
              "Expansion"
            } else {
              "Contraction"
            }) %>%
  select(State, Duration, Period)
# A tibble: 15 x 3
# State Duration Period
# <chr> <chr> <chr>
# 1 Expansion 5 1974 - 1978
# 2 Contraction 2 1979 - 1980
# 3 Expansion 1 1981
# 4 Contraction 3 1982 - 1984
# 5 Expansion 2 1985 - 1986
# 6 Contraction 2 1987 - 1988
# 7 Expansion 1 1989
# 8 Contraction 1 1990
# 9 Expansion 1 1991
#10 Contraction 3 1992 - 1994
#11 Expansion 4 1995 - 1998
#12 Contraction 10 1999 - 2008
#13 Expansion 1 2009
#14 Contraction 2 2010 - 2011
#15 Expansion 8 2012 - 2019