在两个数据框中混合日期期间
mix date periods in two dataframe
我有一个关于客户的数据框,他们住的酒店以及酒店的到达和离开日期。
# Client stays: one row per client with the hotel code (reg_com) and the
# arrival and departure dates.
client <- data.frame(
  id = 1:5,
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546)
)
# The year is irrelevant here: parse columns 2 and 3 (arrive, departure) as
# dates and keep only their "month-day" text.
client[2:3] <- lapply(
  client[2:3],
  function(stay_date) format(as.Date(stay_date), format = "%m-%d")
)
还有一个包含酒店和开放日期的数据框。开业日期的年份对我来说并不重要,因为酒店每年都在同一天开业。 X1O和X1C是指酒店开业的第一期,X2O和X2C是指酒店开业的第二期。
即酒店开=[X1O,X1C]和[X2O,X2C]和[X3O,X3C]。
# Hotel opening periods in wide form: for each hotel code (reg_com) up to
# three periods, each given by an opening column (..O) and a closing
# column (..C). NA marks a period the hotel does not have.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-30", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30")
)
# The year is irrelevant here: parse the six period columns (2 to 7) as dates
# and keep only their "month-day" text.
hotel_periodes[2:7] <- lapply(
  hotel_periodes[2:7],
  function(period_date) format(as.Date(period_date), format = "%m-%d")
)
我想知道客户入住时酒店是开门还是关门。
# Desired outcome: the client stays plus a logical OPEN_HOTEL column telling
# whether the hotel was open for the stay.
result <- data.frame(
  id = 1:5,
  arrive = c(
    "2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"
  ),
  departure = c(
    "2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"
  ),
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  OPEN_HOTEL = c(FALSE, TRUE, FALSE, FALSE, TRUE)
)
一种可能的解决方案如下。首先,我把 hotel_periodes 数据整理成合适的格式,将其从宽格式转换为长格式。
library(tidyverse)
library(lubridate)
# Reshape the wide opening-period table into long form: one row per hotel
# (reg_com) and period number, with an `open` and a `close` Date column.
# NOTE(review): as.Date(times) needs full "YYYY-MM-DD" strings, so this
# expects hotel_periodes before any month-day formatting -- confirm input.
hotel_periodes <- hotel_periodes %>%
  # One row per (hotel, original column name, date string).
  gather(period, times, -reg_com) %>%
  mutate(
    # The trailing letter of the column name says whether the value is an
    # opening ("O") or a closing ("C") date.
    oc = str_extract(period, "[OC]"),
    oc = if_else(oc == "O", "open", "close"),
    # The digit in the column name is the period number. The backslash must
    # be doubled inside an R string literal for the regex to receive "\d".
    period = as.numeric(str_extract(period, "\\d")),
    times = as.Date(times)
  ) %>%
  # Put the opening and closing date of a period side by side.
  spread(oc, times) %>%
  # Drop periods a hotel does not have (no opening date).
  filter(!is.na(open)) %>%
  select(reg_com, period, open, close)
hotel_periodes
# reg_com period open close
# 1 12546 1 2019-04-01 2019-06-01
# 2 12546 2 2019-07-01 2019-11-02
# 3 12546 3 2019-12-01 2019-12-30
# 4 12589 1 2018-12-01 2019-01-01
# 5 12589 2 2019-02-20 2019-02-28
# 6 12589 3 2019-06-20 2019-11-01
# 7 12654 1 2018-12-01 2019-04-01
# 8 12657 1 2019-03-04 2019-05-04
# 9 12657 2 2019-06-30 2019-09-30
# 10 12657 3 2019-12-01 2019-01-30
# 11 12666 1 2019-04-30 2019-12-31
然后我将此 data.frame 与客户数据合并。你说年份对你来说不重要。然而,我们需要正确的年份来比较日期。正如你所说的,多年来开放时间保持不变,我做了一个小技巧,将结束年份设置为出发年份,将开放年份设置为出发年份或一年之前(以捕获酒店开放的日期从 12 月到 1 月)。
然后我比较开放、关闭、到达和离开日期,如果到达和离开位于开放和关闭之间,则 return TRUE。最后我总结了每个客户、酒店和到达和离开日期的结果。
# Turn the stay dates into Date objects so they can be compared.
client <- mutate(
  client,
  arrive = as.Date(arrive),
  departure = as.Date(departure)
)

# Attach every opening period of the client's hotel, move the period into the
# year of the stay, and flag stays that lie completely inside a period.
client %>%
  left_join(hotel_periodes, by = "reg_com") %>%
  mutate(
    stay_year = year(departure),
    # The closing date always takes the year of the departure.
    close = `year<-`(close, stay_year),
    # The opening date takes the same year, unless that would put it after
    # the closing date (a period running over the turn of the year); in that
    # case it takes the year before.
    open_same_year = `year<-`(open, stay_year),
    open_prev_year = `year<-`(open, stay_year - 1),
    open = if_else(open_same_year <= close, open_same_year, open_prev_year),
    between = open <= arrive & departure <= close
  ) %>%
  group_by(id, arrive, departure, reg_com) %>%
  # A stay counts as open when at least one period contains it.
  summarize(OPEN_HOTEL = any(between))
# A tibble: 5 x 5
# Groups: id, arrive, departure [5]
# id arrive departure reg_com OPEN_HOTEL
# <int> <date> <date> <dbl> <lgl>
# 1 1 2019-05-01 2019-05-31 12654 FALSE
# 2 2 2018-01-03 2018-01-21 12657 TRUE
# 3 3 2019-04-05 2019-04-25 12666 FALSE
# 4 4 2015-05-03 2015-05-13 12589 FALSE
# 5 5 2017-12-02 2017-12-30 12546 TRUE
数据
注意:我手动把日期 2019-02-30 改成了 2019-02-28(2019-02-30 不是有效日期),因为这不是问题的重点。尽管如此,在合并 data.frame 之前仍有必要先验证日期。
# Client stays with full "YYYY-MM-DD" date strings, kept as character.
client <- data.frame(
  id = 1:5,
  arrive = c(
    "2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"
  ),
  departure = c(
    "2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"
  ),
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  stringsAsFactors = FALSE
)

# Hotel opening periods in wide form with full date strings, kept as
# character. NA marks a period the hotel does not have.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-28", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30"),
  stringsAsFactors = FALSE
)
这终于对我有用了:
# Attach the opening-period columns of each client's hotel, matching on the
# hotel code and keeping every client row.
y <- merge(x = client, y = hotel_periodes, by = "reg_com", all.x = TRUE)
# Test, element-wise, whether each date in `x` lies strictly inside the
# opening period that runs from `a` to `b`.
#
# x: Date vector with the dates to test.
# a: Date vector with the start (opening) date of each period.
# b: Date vector with the end (closing) date of each period.
#
# A period whose start is later than its end within the same calendar year
# (for example 1 December to 30 January) is treated as running over the turn
# of the year: a date matches when it is after the start OR before the end.
# If start and end lie in different years the bounds are taken literally, so
# a start later than the end describes an empty period.
#
# Returns a logical vector; both endpoints are excluded. A missing start or
# end date yields FALSE, a missing `x` yields NA. Empty input gives
# logical(0).
is.between <- function(x, a, b) {
  same_year <- format(a, "%Y") == format(b, "%Y")
  wraps <- same_year & a > b
  inside <- (!wraps & x > a & x < b) | (wraps & (x > a | x < b))
  # A period without a start or an end cannot contain any date.
  inside[is.na(a) | is.na(b)] <- FALSE
  inside
}
# Parse the "month-day" strings into Date objects. The format carries no
# year, so the current year is filled in and all values share the same year.
open_cols <- c("x1O", "X2O", "X3O")
close_cols <- c("X1C", "X2C", "X3C")
date_cols <- c("arrive", "departure", open_cols, close_cols)
y[date_cols] <- lapply(y[date_cols], as.Date, format = "%m-%d")

# Periods a hotel does not have are NA. Replace them with a fixed placeholder
# date so that is.between() never compares against a missing value; start and
# end receive the same placeholder, so such a period contains no date.
for (period_col in c(open_cols, close_cols)) {
  y[[period_col]][is.na(y[[period_col]])] <- as.Date("1999-12-12")
}

# TRUE for every row whose date falls inside at least one of the hotel's
# opening periods.
in_any_period <- function(dates) {
  hits <- Map(
    function(open_col, close_col) {
      is.between(dates, y[[open_col]], y[[close_col]])
    },
    open_cols, close_cols
  )
  Reduce(`|`, hits)
}

# A stay is flagged when its arrival OR its departure lies in an opening
# period.
# NOTE(review): this does not require the whole stay to be inside one period
# -- confirm that this is the intended rule.
y[, "correct"] <- in_any_period(y$arrive) | in_any_period(y$departure)
我有一个关于客户的数据框,他们住的酒店以及酒店的到达和离开日期。
# Client stays: one row per client with the hotel code (reg_com) and the
# arrival and departure dates.
client <- data.frame(
  id = 1:5,
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546)
)
# The year is irrelevant here: parse columns 2 and 3 (arrive, departure) as
# dates and keep only their "month-day" text.
client[2:3] <- lapply(
  client[2:3],
  function(stay_date) format(as.Date(stay_date), format = "%m-%d")
)
还有一个包含酒店和开放日期的数据框。开业日期的年份对我来说并不重要,因为酒店每年都在同一天开业。 X1O和X1C是指酒店开业的第一期,X2O和X2C是指酒店开业的第二期。 即酒店开=[X1O,X1C]和[X2O,X2C]和[X3O,X3C]。
# Hotel opening periods in wide form: for each hotel code (reg_com) up to
# three periods, each given by an opening column (..O) and a closing
# column (..C). NA marks a period the hotel does not have.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-30", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30")
)
# The year is irrelevant here: parse the six period columns (2 to 7) as dates
# and keep only their "month-day" text.
hotel_periodes[2:7] <- lapply(
  hotel_periodes[2:7],
  function(period_date) format(as.Date(period_date), format = "%m-%d")
)
我想知道客户入住时酒店是开门还是关门。
# Desired outcome: the client stays plus a logical OPEN_HOTEL column telling
# whether the hotel was open for the stay.
result <- data.frame(
  id = 1:5,
  arrive = c(
    "2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"
  ),
  departure = c(
    "2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"
  ),
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  OPEN_HOTEL = c(FALSE, TRUE, FALSE, FALSE, TRUE)
)
一种可能的解决方案如下。首先,我把 hotel_periodes 数据整理成合适的格式,将其从宽格式转换为长格式。
library(tidyverse)
library(lubridate)
# Reshape the wide opening-period table into long form: one row per hotel
# (reg_com) and period number, with an `open` and a `close` Date column.
# NOTE(review): as.Date(times) needs full "YYYY-MM-DD" strings, so this
# expects hotel_periodes before any month-day formatting -- confirm input.
hotel_periodes <- hotel_periodes %>%
  # One row per (hotel, original column name, date string).
  gather(period, times, -reg_com) %>%
  mutate(
    # The trailing letter of the column name says whether the value is an
    # opening ("O") or a closing ("C") date.
    oc = str_extract(period, "[OC]"),
    oc = if_else(oc == "O", "open", "close"),
    # The digit in the column name is the period number. The backslash must
    # be doubled inside an R string literal for the regex to receive "\d".
    period = as.numeric(str_extract(period, "\\d")),
    times = as.Date(times)
  ) %>%
  # Put the opening and closing date of a period side by side.
  spread(oc, times) %>%
  # Drop periods a hotel does not have (no opening date).
  filter(!is.na(open)) %>%
  select(reg_com, period, open, close)
hotel_periodes
# reg_com period open close
# 1 12546 1 2019-04-01 2019-06-01
# 2 12546 2 2019-07-01 2019-11-02
# 3 12546 3 2019-12-01 2019-12-30
# 4 12589 1 2018-12-01 2019-01-01
# 5 12589 2 2019-02-20 2019-02-28
# 6 12589 3 2019-06-20 2019-11-01
# 7 12654 1 2018-12-01 2019-04-01
# 8 12657 1 2019-03-04 2019-05-04
# 9 12657 2 2019-06-30 2019-09-30
# 10 12657 3 2019-12-01 2019-01-30
# 11 12666 1 2019-04-30 2019-12-31
然后我将此 data.frame 与客户数据合并。你说年份对你来说不重要。然而,我们需要正确的年份来比较日期。正如你所说的,多年来开放时间保持不变,我做了一个小技巧,将结束年份设置为出发年份,将开放年份设置为出发年份或一年之前(以捕获酒店开放的日期从 12 月到 1 月)。 然后我比较开放、关闭、到达和离开日期,如果到达和离开位于开放和关闭之间,则 return TRUE。最后我总结了每个客户、酒店和到达和离开日期的结果。
# Turn the stay dates into Date objects so they can be compared.
client <- mutate(
  client,
  arrive = as.Date(arrive),
  departure = as.Date(departure)
)

# Attach every opening period of the client's hotel, move the period into the
# year of the stay, and flag stays that lie completely inside a period.
client %>%
  left_join(hotel_periodes, by = "reg_com") %>%
  mutate(
    stay_year = year(departure),
    # The closing date always takes the year of the departure.
    close = `year<-`(close, stay_year),
    # The opening date takes the same year, unless that would put it after
    # the closing date (a period running over the turn of the year); in that
    # case it takes the year before.
    open_same_year = `year<-`(open, stay_year),
    open_prev_year = `year<-`(open, stay_year - 1),
    open = if_else(open_same_year <= close, open_same_year, open_prev_year),
    between = open <= arrive & departure <= close
  ) %>%
  group_by(id, arrive, departure, reg_com) %>%
  # A stay counts as open when at least one period contains it.
  summarize(OPEN_HOTEL = any(between))
# A tibble: 5 x 5
# Groups: id, arrive, departure [5]
# id arrive departure reg_com OPEN_HOTEL
# <int> <date> <date> <dbl> <lgl>
# 1 1 2019-05-01 2019-05-31 12654 FALSE
# 2 2 2018-01-03 2018-01-21 12657 TRUE
# 3 3 2019-04-05 2019-04-25 12666 FALSE
# 4 4 2015-05-03 2015-05-13 12589 FALSE
# 5 5 2017-12-02 2017-12-30 12546 TRUE
数据
注意:我手动把日期 2019-02-30 改成了 2019-02-28(2019-02-30 不是有效日期),因为这不是问题的重点。尽管如此,在合并 data.frame 之前仍有必要先验证日期。
# Client stays with full "YYYY-MM-DD" date strings, kept as character.
client <- data.frame(
  id = 1:5,
  arrive = c(
    "2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"
  ),
  departure = c(
    "2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"
  ),
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  stringsAsFactors = FALSE
)

# Hotel opening periods in wide form with full date strings, kept as
# character. NA marks a period the hotel does not have.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-28", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30"),
  stringsAsFactors = FALSE
)
这终于对我有用了:
# Attach the opening-period columns of each client's hotel, matching on the
# hotel code and keeping every client row.
y <- merge(x = client, y = hotel_periodes, by = "reg_com", all.x = TRUE)
# Test, element-wise, whether each date in `x` lies strictly inside the
# opening period that runs from `a` to `b`.
#
# x: Date vector with the dates to test.
# a: Date vector with the start (opening) date of each period.
# b: Date vector with the end (closing) date of each period.
#
# A period whose start is later than its end within the same calendar year
# (for example 1 December to 30 January) is treated as running over the turn
# of the year: a date matches when it is after the start OR before the end.
# If start and end lie in different years the bounds are taken literally, so
# a start later than the end describes an empty period.
#
# Returns a logical vector; both endpoints are excluded. A missing start or
# end date yields FALSE, a missing `x` yields NA. Empty input gives
# logical(0).
is.between <- function(x, a, b) {
  same_year <- format(a, "%Y") == format(b, "%Y")
  wraps <- same_year & a > b
  inside <- (!wraps & x > a & x < b) | (wraps & (x > a | x < b))
  # A period without a start or an end cannot contain any date.
  inside[is.na(a) | is.na(b)] <- FALSE
  inside
}
# Parse the "month-day" strings into Date objects. The format carries no
# year, so the current year is filled in and all values share the same year.
open_cols <- c("x1O", "X2O", "X3O")
close_cols <- c("X1C", "X2C", "X3C")
date_cols <- c("arrive", "departure", open_cols, close_cols)
y[date_cols] <- lapply(y[date_cols], as.Date, format = "%m-%d")

# Periods a hotel does not have are NA. Replace them with a fixed placeholder
# date so that is.between() never compares against a missing value; start and
# end receive the same placeholder, so such a period contains no date.
for (period_col in c(open_cols, close_cols)) {
  y[[period_col]][is.na(y[[period_col]])] <- as.Date("1999-12-12")
}

# TRUE for every row whose date falls inside at least one of the hotel's
# opening periods.
in_any_period <- function(dates) {
  hits <- Map(
    function(open_col, close_col) {
      is.between(dates, y[[open_col]], y[[close_col]])
    },
    open_cols, close_cols
  )
  Reduce(`|`, hits)
}

# A stay is flagged when its arrival OR its departure lies in an opening
# period.
# NOTE(review): this does not require the whole stay to be inside one period
# -- confirm that this is the intended rule.
y[, "correct"] <- in_any_period(y$arrive) | in_any_period(y$departure)