根据另一个数据框中的条件,在数据框中创建新变量
Create a new variable in data frame with condition over another dataframe
我有 2 个这样的数据框
df1
date item
02/01/2017 A
09/01/2017 B
14/01/2017 C
df2
date1 date2 item prm
01/01/2017 03/01/2017 A YES
08/01/2017 10/01/2017 B YES
15/01/2017 17/01/2017 C YES
其中
prm
变量是常量变量,它只有 1 个值。
我想在我的 df1 中添加变量 prm,
条件是:
df1$date 介于 df2$date1 和 df2$date2 之间,并且 df1$item == df2$item
但是,如果条件不匹配,那么我需要 prm
取值为 "NO"
你可以在这里使用ifelse
# Example data. stringsAsFactors = FALSE keeps `item` as character on every
# R version, so items from the two tables can be compared with `==`.
df1 <- read.table(text = "date item
02/01/2017 A
09/01/2017 B
16/01/2017 C", header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "date1 date2 item
01/01/2017 03/01/2017 A
08/01/2017 10/01/2017 B
15/01/2017 17/01/2017 C", header = TRUE, stringsAsFactors = FALSE)

# Parse the day/month/year text into Date objects so the range test compares
# calendar dates instead of strings.
df1$date <- as.Date(df1$date, format = "%d/%m/%Y")
df2$date1 <- as.Date(df2$date1, format = "%d/%m/%Y")
df2$date2 <- as.Date(df2$date2, format = "%d/%m/%Y")

# Returns a logical vector, one element per (dates[i], items[i]) pair: TRUE when
# at least one row of `ranges` has the same item and an inclusive
# [date1, date2] interval containing the date.
# Every row of `ranges` is checked, so the two tables do not need the same
# number of rows or the same row order. Missing dates count as "no match".
in_any_range <- function(dates, items, ranges) {
  vapply(
    seq_along(dates),
    function(i) {
      any(
        ranges$item == items[i] &
          dates[i] >= ranges$date1 &
          dates[i] <= ranges$date2,
        na.rm = TRUE
      )
    },
    logical(1)
  )
}

df1$prm <- ifelse(in_any_range(df1$date, df1$item, df2), "YES", "NO")
date item prm
1 2017-01-02 A YES
2 2017-01-09 B YES
3 2017-01-16 C YES
这是一个使用 dplyr 的解决方案:
library(tidyverse)

# Example data; dates are still text in day/month/year form at this point.
df1 <- tribble(
  ~date, ~item,
  "02/01/2017", "A",
  "09/01/2017", "B",
  "16/01/2017", "C"
)
df2 <- tribble(
  ~date1, ~date2, ~item,
  "01/01/2017", "03/01/2017", "A",
  "08/01/2017", "10/01/2017", "B",
  "15/01/2017", "15/01/2017", "C"
)

# Left join on item: all.x = TRUE keeps df1 rows whose item has no row in df2
# (their date1/date2 become NA) instead of silently dropping them.
df3 <- merge(x = df1, y = df2, by = "item", all.x = TRUE)

# Convert the date columns by name rather than by position, so the step does
# not depend on the column order produced by merge().
date_cols <- c("date", "date1", "date2")
df4 <- as.data.frame(df3)
df4[date_cols] <- lapply(df4[date_cols], as.Date, format = "%d/%m/%Y")

# "Between" is inclusive: a date equal to date1 or date2 counts as a match.
# missing = "NO" covers rows where the comparison is NA (no df2 row for the
# item, or an unparseable date).
df5 <- df4 %>%
  mutate(
    prm = if_else(date >= date1 & date <= date2, "YES", "NO", missing = "NO")
  )
df5
[编辑]
如果 df1
和 df2
中的行数不同,您可以使用 sqldf
,以 df1.date between df2.date1 and df2.date2
和 df1.item = df2.item
作为连接条件,并使用 CASE WHEN
语句创建列 prm
:
# Read both tables with character columns. stringsAsFactors is passed per call
# so the global options() are left untouched.
df1 <- read.table(text =
"date item
02/01/2017 A
09/01/2017 B
16/01/2017 C
02/01/2017 C",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text =
"date1 date2 item
01/01/2017 03/01/2017 A
08/01/2017 10/01/2017 B
15/01/2017 17/01/2017 C",
header = TRUE, stringsAsFactors = FALSE)

# Convert the dd/mm/yyyy text to Date before querying. Comparing the raw
# strings with BETWEEN would be lexicographic, e.g. "02/02/2017" would sort
# between "01/01/2017" and "03/01/2017" although 2 Feb is not in 1-3 Jan.
df1$date <- as.Date(df1$date, format = "%d/%m/%Y")
df2$date1 <- as.Date(df2$date1, format = "%d/%m/%Y")
df2$date2 <- as.Date(df2$date2, format = "%d/%m/%Y")

library(sqldf)

# LEFT JOIN keeps every df1 row. When no df2 row satisfies the ON condition,
# df2.item is NULL, the CASE comparison is not true, and prm becomes 'NO'.
# Note: a df1 row is returned once per matching df2 row, so overlapping
# intervals for the same item in df2 would duplicate that df1 row.
sqldf("
SELECT df1.*, CASE WHEN df1.item = df2.item THEN 'YES' ELSE 'NO' END AS prm
FROM df1
LEFT JOIN df2
ON df1.date BETWEEN df2.date1 AND df2.date2
AND df1.item = df2.item
")
#> date item prm
#> 1 2017-01-02 A YES
#> 2 2017-01-09 B YES
#> 3 2017-01-16 C YES
#> 4 2017-01-02 C NO
使用 data.table 提供的非等值连接(non-equi join)和连接时更新(update on join),
可以写成:
library(data.table)
# NOTE(review): df1 and df2 are not created in this snippet; it presumably
# expects the tables from the question, with Date columns and a prm column in
# df2 -- confirm before running.
# Convert both tables to data.table by reference.
setDT(df1)
setDT(df2)
# Non-equi update join: for df1 rows whose item equals a df2 row's item and
# whose date satisfies date >= date1 and date <= date2, copy that row's prm.
df1[df2, on = .(item, date >= date1, date <= date2), prm := i.prm]
# Rows that found no match still have prm = NA; label them "NO".
df1[is.na(prm), prm := "NO"]
df1
date item prm
1: 2017-01-02 A YES
2: 2017-01-09 B YES
3: 2017-01-14 C NO
我有 2 个这样的数据框
df1
date item
02/01/2017 A
09/01/2017 B
14/01/2017 C
df2
date1 date2 item prm
01/01/2017 03/01/2017 A YES
08/01/2017 10/01/2017 B YES
15/01/2017 17/01/2017 C YES
其中
prm
变量是常量变量,它只有 1 个值。
我想在我的 df1 中添加变量 prm,
条件是:
df1$date 介于 df2$date1 和 df2$date2 之间,并且 df1$item == df2$item
但是,如果条件不匹配,那么我需要 prm
取值为 "NO"
你可以在这里使用ifelse
# Example data. stringsAsFactors = FALSE keeps `item` as character on every
# R version, so items from the two tables can be compared with `==`.
df1 <- read.table(text = "date item
02/01/2017 A
09/01/2017 B
16/01/2017 C", header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "date1 date2 item
01/01/2017 03/01/2017 A
08/01/2017 10/01/2017 B
15/01/2017 17/01/2017 C", header = TRUE, stringsAsFactors = FALSE)

# Parse the day/month/year text into Date objects so the range test compares
# calendar dates instead of strings.
df1$date <- as.Date(df1$date, format = "%d/%m/%Y")
df2$date1 <- as.Date(df2$date1, format = "%d/%m/%Y")
df2$date2 <- as.Date(df2$date2, format = "%d/%m/%Y")

# Returns a logical vector, one element per (dates[i], items[i]) pair: TRUE when
# at least one row of `ranges` has the same item and an inclusive
# [date1, date2] interval containing the date.
# Every row of `ranges` is checked, so the two tables do not need the same
# number of rows or the same row order. Missing dates count as "no match".
in_any_range <- function(dates, items, ranges) {
  vapply(
    seq_along(dates),
    function(i) {
      any(
        ranges$item == items[i] &
          dates[i] >= ranges$date1 &
          dates[i] <= ranges$date2,
        na.rm = TRUE
      )
    },
    logical(1)
  )
}

df1$prm <- ifelse(in_any_range(df1$date, df1$item, df2), "YES", "NO")
date item prm
1 2017-01-02 A YES
2 2017-01-09 B YES
3 2017-01-16 C YES
这是一个使用 dplyr 的解决方案:
library(tidyverse)

# Example data; dates are still text in day/month/year form at this point.
df1 <- tribble(
  ~date, ~item,
  "02/01/2017", "A",
  "09/01/2017", "B",
  "16/01/2017", "C"
)
df2 <- tribble(
  ~date1, ~date2, ~item,
  "01/01/2017", "03/01/2017", "A",
  "08/01/2017", "10/01/2017", "B",
  "15/01/2017", "15/01/2017", "C"
)

# Left join on item: all.x = TRUE keeps df1 rows whose item has no row in df2
# (their date1/date2 become NA) instead of silently dropping them.
df3 <- merge(x = df1, y = df2, by = "item", all.x = TRUE)

# Convert the date columns by name rather than by position, so the step does
# not depend on the column order produced by merge().
date_cols <- c("date", "date1", "date2")
df4 <- as.data.frame(df3)
df4[date_cols] <- lapply(df4[date_cols], as.Date, format = "%d/%m/%Y")

# "Between" is inclusive: a date equal to date1 or date2 counts as a match.
# missing = "NO" covers rows where the comparison is NA (no df2 row for the
# item, or an unparseable date).
df5 <- df4 %>%
  mutate(
    prm = if_else(date >= date1 & date <= date2, "YES", "NO", missing = "NO")
  )
df5
[编辑]
如果 df1
和 df2
中的行数不同,您可以使用 sqldf
,以 df1.date between df2.date1 and df2.date2
和 df1.item = df2.item
作为连接条件,并使用 CASE WHEN
语句创建列 prm
:
# Read both tables with character columns. stringsAsFactors is passed per call
# so the global options() are left untouched.
df1 <- read.table(text =
"date item
02/01/2017 A
09/01/2017 B
16/01/2017 C
02/01/2017 C",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text =
"date1 date2 item
01/01/2017 03/01/2017 A
08/01/2017 10/01/2017 B
15/01/2017 17/01/2017 C",
header = TRUE, stringsAsFactors = FALSE)

# Convert the dd/mm/yyyy text to Date before querying. Comparing the raw
# strings with BETWEEN would be lexicographic, e.g. "02/02/2017" would sort
# between "01/01/2017" and "03/01/2017" although 2 Feb is not in 1-3 Jan.
df1$date <- as.Date(df1$date, format = "%d/%m/%Y")
df2$date1 <- as.Date(df2$date1, format = "%d/%m/%Y")
df2$date2 <- as.Date(df2$date2, format = "%d/%m/%Y")

library(sqldf)

# LEFT JOIN keeps every df1 row. When no df2 row satisfies the ON condition,
# df2.item is NULL, the CASE comparison is not true, and prm becomes 'NO'.
# Note: a df1 row is returned once per matching df2 row, so overlapping
# intervals for the same item in df2 would duplicate that df1 row.
sqldf("
SELECT df1.*, CASE WHEN df1.item = df2.item THEN 'YES' ELSE 'NO' END AS prm
FROM df1
LEFT JOIN df2
ON df1.date BETWEEN df2.date1 AND df2.date2
AND df1.item = df2.item
")
#> date item prm
#> 1 2017-01-02 A YES
#> 2 2017-01-09 B YES
#> 3 2017-01-16 C YES
#> 4 2017-01-02 C NO
使用 data.table 提供的非等值连接(non-equi join)和连接时更新(update on join),
可以写成:
library(data.table)
# NOTE(review): df1 and df2 are not created in this snippet; it presumably
# expects the tables from the question, with Date columns and a prm column in
# df2 -- confirm before running.
# Convert both tables to data.table by reference.
setDT(df1)
setDT(df2)
# Non-equi update join: for df1 rows whose item equals a df2 row's item and
# whose date satisfies date >= date1 and date <= date2, copy that row's prm.
df1[df2, on = .(item, date >= date1, date <= date2), prm := i.prm]
# Rows that found no match still have prm = NA; label them "NO".
df1[is.na(prm), prm := "NO"]
df1
date item prm
1: 2017-01-02 A YES
2: 2017-01-09 B YES
3: 2017-01-14 C NO