在数据框中创建一个新变量,条件是另一个数据框

Create a new variable in data frame with condition over another dataframe

我有 2 个这样的数据框

df1

       date item 
 02/01/2017    A 
 09/01/2017    B
 14/01/2017    C

df2

      date1       date2  item    prm
 01/01/2017  03/01/2017     A    YES
 08/01/2017  10/01/2017     B    YES
 15/01/2017  17/01/2017     C    YES

目的

prm 是一个常量变量,只有一个取值。我想把变量 prm 添加到 df1 中,条件是:

df1$date is between df2$date1 and df2$date2 and df1$item=df2$item

但是,如果条件不满足,则 prm 应取值 "NO"。

你可以在这里使用ifelse

 # Base R approach: mark each row of df1 with prm = "YES" when some row of df2
# has the same item and an inclusive [date1, date2] interval that contains
# df1$date; otherwise mark it "NO".
df1 <- read.table(text = "      date item 
 02/01/2017    A 
 09/01/2017    B
 16/01/2017    C", header = TRUE, stringsAsFactors = FALSE)

df2 <- read.table(text = "      date1       date2  item
 01/01/2017  03/01/2017     A 
                  08/01/2017  10/01/2017     B
                  15/01/2017  17/01/2017     C", header = TRUE,
  stringsAsFactors = FALSE)

# The dates are written day/month/year, so the format must be given explicitly.
date_format <- "%d/%m/%Y"
df1$date <- as.Date(df1$date, format = date_format)
df2$date1 <- as.Date(df2$date1, format = date_format)
df2$date2 <- as.Date(df2$date2, format = date_format)

# Check every df1 row against ALL df2 rows. A direct element-wise comparison
# of df1 and df2 columns is only correct when both data frames have the same
# number of rows in the same order, and it recycles silently otherwise.
has_match <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    any(
      df2$item == df1$item[i] &
        df2$date1 <= df1$date[i] &
        df1$date[i] <= df2$date2,
      na.rm = TRUE
    )
  },
  logical(1)
)

df1$prm <- ifelse(has_match, "YES", "NO")

        date item prm
1 2017-01-02    A YES
2 2017-01-09    B YES
3 2017-01-16    C YES

这是一个使用 dplyr 的解决方案:

library(tidyverse)

# dplyr approach: join df1 to df2 on item, parse the date columns, then flag
# each row "YES" when date lies in the inclusive [date1, date2] interval and
# "NO" otherwise.
df1 <- tribble(
  ~date,        ~item,
  "02/01/2017", "A",
  "09/01/2017", "B",
  "16/01/2017", "C"
)

df2 <- tribble(
  ~date1,       ~date2,       ~item,
  "01/01/2017", "03/01/2017", "A",
  "08/01/2017", "10/01/2017", "B",
  "15/01/2017", "15/01/2017", "C"
)

# all.x = TRUE makes this a left join, so a df1 row whose item does not occur
# in df2 is kept (with NA bounds) instead of being dropped.
df3 <- merge(x = df1, y = df2, by = "item", all.x = TRUE)

# Convert the three date columns by name rather than by position.
df4 <- df3 %>%
  mutate(
    across(
      c(date, date1, date2),
      function(x) as.Date(x, format = "%d/%m/%Y")
    )
  )

# Inclusive bounds implement "between"; missing = "NO" covers rows that found
# no partner in df2 (their bounds are NA).
df5 <- df4 %>%
  mutate(
    prm = if_else(date >= date1 & date <= date2, "YES", "NO", missing = "NO")
  )

df5

[编辑]

如果 df1 和 df2 中的行数不同,您可以使用 sqldf,以 df1.date between df2.date1 and df2.date2 和 df1.item = df2.item 为条件进行左连接,并使用 CASE WHEN 语句创建列 prm:

# sqldf approach: left-join df1 to df2 on item and date range, and derive prm
# from whether a matching df2 row was found. Works when df1 and df2 have
# different numbers of rows.
library(sqldf)

# stringsAsFactors is set per call instead of through a global option.
df1 <- read.table(text = 
"date item 
02/01/2017    A 
09/01/2017    B
16/01/2017    C 
02/01/2017    C",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text =
"date1       date2  item
01/01/2017  03/01/2017     A 
08/01/2017  10/01/2017     B
15/01/2017  17/01/2017     C",
header = TRUE, stringsAsFactors = FALSE)

# BETWEEN on dd/mm/yyyy text compares character by character, not by calendar
# order (e.g. "02/02/2017" sorts between "01/01/2017" and "03/01/2017").
# Join on real Date helper columns instead; per the sqldf documentation these
# are passed to SQLite as numbers, so the range test is chronological.
# The original text columns are kept so the result shows the dates as entered.
date_format <- "%d/%m/%Y"
df1_dated <- transform(df1, day = as.Date(date, format = date_format))
df2_dated <- transform(
  df2,
  start_day = as.Date(date1, format = date_format),
  end_day = as.Date(date2, format = date_format)
)

# An unmatched df1 row gets NULLs for every df2 column, hence the IS NULL test.
result <- sqldf("
  SELECT a.date, a.item,
         CASE WHEN b.item IS NULL THEN 'no' ELSE 'yes' END AS prm
  FROM df1_dated AS a
  LEFT JOIN df2_dated AS b
    ON a.item = b.item
   AND a.day BETWEEN b.start_day AND b.end_day
  ")
result

        date item prm
1 02/01/2017    A yes
2 09/01/2017    B yes
3 16/01/2017    C yes
4 02/01/2017    C  no

使用 data.table 提供的非等值连接(non-equi join)和连接时更新(update on join),可以写成:

library(data.table)

# data.table approach: a non-equi join combined with an update on join.
# The inputs are defined here so the snippet runs on its own: the join needs
# real Date columns and a prm column in df2.
date_format <- "%d/%m/%Y"
df1 <- data.frame(
  date = as.Date(c("02/01/2017", "09/01/2017", "14/01/2017"),
                 format = date_format),
  item = c("A", "B", "C"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  date1 = as.Date(c("01/01/2017", "08/01/2017", "15/01/2017"),
                  format = date_format),
  date2 = as.Date(c("03/01/2017", "10/01/2017", "17/01/2017"),
                  format = date_format),
  item = c("A", "B", "C"),
  prm = "YES",
  stringsAsFactors = FALSE
)

# Convert both data frames to data.tables by reference.
setDT(df1)
setDT(df2)

# For every df2 row, find the df1 rows with the same item whose date lies in
# [date1, date2] and copy df2's prm into them (i.prm is the prm column of df2).
df1[df2, on = .(item, date >= date1, date <= date2), prm := i.prm]

# Rows of df1 that matched nothing still have NA in prm; label them "NO".
df1[is.na(prm), prm := "NO"]
df1
         date item prm
1: 2017-01-02    A YES
2: 2017-01-09    B YES
3: 2017-01-14    C  NO