在两个数据框中混合日期期间

mix date periods in two dataframe

我有一个客户数据框,其中包含客户入住的酒店,以及客户到达和离开酒店的日期。

# Sample clients: one stay per row; reg_com identifies the hotel.
client <- data.frame(
  id = 1:5,
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546)
)

# The year is irrelevant for this question, so both stay dates (columns 2
# and 3) are parsed as dates and then reduced to "MM-DD" text.
client[2:3] <- lapply(
  client[2:3],
  function(stay_date) format(as.Date(stay_date), format = "%m-%d")
)

还有一个包含酒店及其开放日期的数据框。开放日期的年份对我来说并不重要,因为酒店每年都在相同的日期开放。x1O 和 X1C 分别是酒店第一个开放期的开始和结束日期,X2O 和 X2C 对应第二个开放期,X3O 和 X3C 对应第三个开放期。也就是说,酒店的开放期间为 [x1O, X1C]、[X2O, X2C] 和 [X3O, X3C]。

# Opening periods per hotel (reg_com). Each pair of columns is one period:
# x1O/X1C, X2O/X2C and X3O/X3C hold the opening and closing date of the
# first, second and third period; NA means the hotel has no such period.
# NOTE(review): "2019-02-30" in X2C is not a real calendar date, so
# as.Date() is expected to turn it into NA -- validate the input data.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-30", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30")
)

# The year is irrelevant, so every period bound (columns 2 to 7) is parsed
# as a date and then reduced to "MM-DD" text.
hotel_periodes[2:7] <- lapply(
  hotel_periodes[2:7],
  function(bound) format(as.Date(bound), format = "%m-%d")
)

我想知道客户入住时酒店是开门还是关门。


# Expected outcome: the original client rows plus a flag telling whether
# the hotel was open during the stay.
result <- data.frame(
  id = 1:5,
  arrive = c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02"),
  departure = c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30"),
  reg_com = c(12654, 12657, 12666, 12589, 12546)
)
result$OPEN_HOTEL <- c(FALSE, TRUE, FALSE, FALSE, TRUE)

一种可能的解决方案如下。首先,我把 hotel_periodes 数据整理成合适的格式,将其从宽格式转换为长格式。

library(tidyverse)
library(lubridate)

# Reshape the opening periods from wide (one column per period bound) to
# long (one row per hotel and period, with an `open` and a `close` date).
hotel_periodes <- hotel_periodes %>%
  # One row per hotel and original column; `period` holds the column name.
  gather(period, times, -reg_com) %>%
  mutate(
    # The trailing letter of the column name tells opening from closing.
    oc = str_extract(period, "[OC]"),
    oc = if_else(oc == "O", "open", "close"),
    # The digit in the column name is the period number. The backslash has
    # to be doubled: "\d" on its own is not a valid R string escape.
    period = as.numeric(str_extract(period, "\\d")),
    times = as.Date(times)
  ) %>%
  # Put the opening and closing date of a period side by side.
  spread(oc, times) %>%
  # Drop periods a hotel does not have (their dates are NA).
  filter(!is.na(open)) %>%
  select(reg_com, period, open, close)

hotel_periodes
#    reg_com period       open      close
# 1    12546      1 2019-04-01 2019-06-01
# 2    12546      2 2019-07-01 2019-11-02
# 3    12546      3 2019-12-01 2019-12-30
# 4    12589      1 2018-12-01 2019-01-01
# 5    12589      2 2019-02-20 2019-02-28
# 6    12589      3 2019-06-20 2019-11-01
# 7    12654      1 2018-12-01 2019-04-01
# 8    12657      1 2019-03-04 2019-05-04
# 9    12657      2 2019-06-30 2019-09-30
# 10   12657      3 2019-12-01 2019-01-30
# 11   12666      1 2019-04-30 2019-12-31

然后我将这个 data.frame 与客户数据合并。你说年份对你来说不重要,然而要比较日期,我们仍然需要正确的年份。既然如你所说,开放日期每年保持不变,我就用了一个小技巧:把关闭日期的年份设为离开日期的年份,把开放日期的年份设为离开日期的年份或其前一年(以覆盖酒店从 12 月开放到次年 1 月的情况)。然后我比较开放、关闭、到达和离开日期:如果到达和离开日期都位于开放日期和关闭日期之间,则返回 TRUE。最后,我按客户、酒店以及到达和离开日期对结果进行汇总。

# Stay dates must be real Date objects before they can be compared.
client <- client %>%
  mutate(arrive = as.Date(arrive),
         departure = as.Date(departure))

# For every stay, test each opening period of the booked hotel and report
# whether at least one period contains the whole stay.
left_join(client, hotel_periodes, by = "reg_com") %>%
  mutate(
    # Opening periods repeat every year, so move them to the year of the stay.
    dep_year = year(departure),
    open_cur = `year<-`(open, dep_year),
    close_cur = `year<-`(close, dep_year),
    # A period that opens later in the year than it closes spans New Year.
    wraps = open_cur > close_cur,
    between = if_else(
      wraps,
      # Such a period can contain the stay in two ways: it opened the year
      # before the departure (stay in January, or across New Year), or it
      # closes the year after (stay entirely in December).
      (`year<-`(open, dep_year - 1) <= arrive & departure <= close_cur) |
        (open_cur <= arrive & departure <= `year<-`(close, dep_year + 1)),
      open_cur <= arrive & departure <= close_cur
    )
  ) %>%
  group_by(id, arrive, departure, reg_com) %>%
  summarize(OPEN_HOTEL = any(between)) %>%
  # Return a plain, ungrouped table so later steps are not grouped by accident.
  ungroup()

# A tibble: 5 x 5
#      id arrive     departure  reg_com OPEN_HOTEL
#   <int> <date>     <date>       <dbl> <lgl>
# 1     1 2019-05-01 2019-05-31   12654 FALSE
# 2     2 2018-01-03 2018-01-21   12657 TRUE
# 3     3 2019-04-05 2019-04-25   12666 FALSE
# 4     4 2015-05-03 2015-05-13   12589 FALSE
# 5     5 2017-12-02 2017-12-30   12546 TRUE

数据

注意:我手动把日期2019-02-30改成了2019-02-28,因为这不是问题的重点。尽管如此,还是有必要在合并 data.frame 之前验证日期。

# Client stays; dates are kept as plain text (not factors) at this point.
client <- local({
  arrivals <- c("2019-05-01", "2018-01-03", "2019-04-05", "2015-05-03", "2017-12-02")
  departures <- c("2019-05-31", "2018-01-21", "2019-04-25", "2015-05-13", "2017-12-30")
  data.frame(
    id = 1:5,
    arrive = arrivals,
    departure = departures,
    reg_com = c(12654, 12657, 12666, 12589, 12546),
    stringsAsFactors = FALSE
  )
})

# Opening periods per hotel: x1O/X1C, X2O/X2C and X3O/X3C are the opening
# and closing dates of up to three periods; NA marks a missing period.
hotel_periodes <- data.frame(
  reg_com = c(12654, 12657, 12666, 12589, 12546),
  x1O = c("2018-12-01", "2019-03-04", "2019-04-30", "2018-12-01", "2019-04-01"),
  X1C = c("2019-04-01", "2019-05-04", "2019-12-31", "2019-01-01", "2019-06-01"),
  X2O = c(NA, "2019-06-30", NA, "2019-02-20", "2019-07-01"),
  X2C = c(NA, "2019-09-30", NA, "2019-02-28", "2019-11-02"),
  X3O = c(NA, "2019-12-01", NA, "2019-06-20", "2019-12-01"),
  X3C = c(NA, "2019-01-30", NA, "2019-11-01", "2019-12-30"),
  stringsAsFactors = FALSE
)

这终于对我有用了:

# Attach the opening periods of the booked hotel to every client row;
# all.x = TRUE keeps clients whose hotel has no entry in hotel_periodes.
y <- merge(
  x = client,
  y = hotel_periodes,
  by = "reg_com",
  all.x = TRUE
)


#' Test whether dates fall strictly inside a yearly opening period
#'
#' All three arguments are expected to be expressed in the same calendar
#' year, i.e. only month and day carry meaning. A period whose start `a`
#' lies after its end `b` is treated as spanning New Year (for example
#' 12-01 to 01-30): a date is inside it when it is after the start OR
#' before the end.
#'
#' @param x Dates to test.
#' @param a Period start dates (exclusive bound).
#' @param b Period end dates (exclusive bound).
#' @return A logical vector: `TRUE` where `x` lies strictly inside the
#'   period, `NA` where the answer depends on a missing value.
is.between <- function(x, a, b) {
  # Only periods with both bounds known can be recognised as spanning
  # New Year; a missing bound must not be mistaken for one.
  wraps <- !is.na(a) & !is.na(b) & a > b

  inside <- (x > a) & (b > x)
  if (any(wraps)) {
    # For a period spanning New Year the two halves are joined with "or".
    around <- (x > a) | (b > x)
    inside[wraps] <- around[wraps]
  }
  inside
}


# Columns holding "MM-DD" text: the two stay dates and the opening and
# closing bounds of the three opening periods.
stay_cols <- c("arrive", "departure")
period_cols <- c("x1O", "X1C", "X2O", "X2C", "X3O", "X3C")

# Turn the "MM-DD" text into Date objects. A fixed leap year is prepended
# so that every month-day (including 02-29) parses and the result does not
# depend on the day the script is run; as.Date(x, "%m-%d") would silently
# fill in the current year instead. Missing values stay NA.
y[c(stay_cols, period_cols)] <- lapply(
  y[c(stay_cols, period_cols)],
  function(month_day) as.Date(paste0("2020-", month_day), format = "%Y-%m-%d")
)

# Missing period bounds are replaced by one sentinel date, column by
# column. Start and end of a missing period are then equal, so no date can
# lie strictly between them and the period never matches.
y[period_cols] <- lapply(
  y[period_cols],
  function(bound) {
    bound[is.na(bound)] <- as.Date("1999-12-12")
    bound
  }
)

# TRUE when `when` falls inside any of the three opening periods.
in_any_period <- function(when) {
  is.between(when, y$x1O, y$X1C) |
    is.between(when, y$X2O, y$X2C) |
    is.between(when, y$X3O, y$X3C)
}

# The hotel counts as open when the arrival or the departure date falls
# inside one of its opening periods.
y[, "correct"] <- in_any_period(y$arrive) | in_any_period(y$departure)