R:使用日期和 ID 按多个条件合并 2 个数据框
R: Merge 2 Data Frame by Multiple Condition Using Dates & ID
我正在尝试使用多个条件合并 2 个数据框并使用了合并命令但无法获得成功的输出。
# Data frame df1: one row per ID with a location and a start/end date
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(
  c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01")
)
enddate <- as.Date(
  c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31")
)
df1 <- data.frame(
  ID = ID,
  Location = Location,
  startdate = startdate,
  enddate = enddate
)

# Data frame df2: loss records keyed by ID (note: ID is overwritten here)
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")
df2 <- data.frame(ID = ID, N = N, Loss_Date = Loss_Date, Amt = Amt)
我想以 ID 作为公共列合并这 2 个数据框,并且要求 df2 中的 Loss_Date 位于 df1 中的 startdate 和 enddate 之间(含两端)。你可以看到 df2 中的第二个条目没有被映射,因为其日期不在 df1
的范围内
# Required output: one row per df1 record; rows without a loss inside the
# date range carry N = 0, Amt = "0" and a missing Loss_Date.
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))
N <- c(2, 0, 0, 2)
# Use real missing values (NA), not the string "NA", so Loss_Date is a Date
# column and is.na() identifies the unmatched rows.
Loss_Date <- as.Date(c("2014-11-15", NA, NA, "2015-11-30"))
Amt <- c("2200", "0", "0", "500")
Output <- data.frame(ID, Location, startdate, enddate, N, Loss_Date, Amt)
我使用年份和 ID 创建了一个通用 ID,但得到了错误的映射。尝试了各种使用合并和匹配的方法,但都不起作用。我需要在超过 17 万条观测上运行这个合并。两个数据框的长度不相等。任何帮助都将不胜感激。
我使用包 dplyr
完成了合并,它非常快速且易于使用。
您应该将此 stringsAsFactors=F
添加到您的数据框定义中
# stringsAsFactors = FALSE keeps character columns as character; FALSE is
# spelled out because F is an ordinary variable that can be reassigned.
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)
df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)
因此您的字符输入不会更改为因子,它们不会给您带来不希望的结果
# Install dplyr only when it is missing instead of reinstalling on every run
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Join on ID, then keep only losses dated within [startdate, enddate].
# filter() also drops rows where the comparison is NA (IDs with no loss).
output <- full_join(df1, df2, by = "ID") %>%
  filter(Loss_Date >= startdate & Loss_Date <= enddate)
输出:
ID Location startdate enddate N Loss_Date Amt
1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
2 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
再次说明,如注释所指,如果要保留不符合条件的行,应使用另一个函数:
# Keep every df1 row: a loss inside [startdate, enddate] keeps its values,
# a df1 row with no such loss is kept once with N = 0, Loss_Date = NA, Amt = 0.
output2 <- left_join(df1, df2, by = "ID") %>%
  # TRUE only for a loss inside the range; NA comparisons count as FALSE
  mutate(condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate,
                              FALSE)) %>%
  mutate(N = ifelse(condition, N, 0)) %>%
  # ifelse() drops the Date class, so convert the day counts back to Date
  mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                             origin = "1970-01-01")) %>%
  mutate(Amt = ifelse(condition, Amt, 0)) %>%
  group_by(ID, Location, startdate, enddate) %>%
  # Keep matched rows; if a df1 row has no match at all, keep one zeroed row
  # (previously such a row was dropped when its ID had out-of-range losses).
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition) %>%
  as.data.frame()
首先创建一个符合条件的新列,然后根据该条件将其他列更改为0
或NA
。最后,取消选择新生成的列。 (注意 ifelse
将 Date
的 class 更改为 numeric
,因此需要 as.Date
)
ID Location startdate enddate N Loss_Date Amt
1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
2 A2 234B 2014-01-01 2014-08-31 0 <NA> 0
3 A3 012A 2015-10-01 2015-12-31 0 <NA> 0
4 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
我刚刚在@VincentBoned 的回答中添加了一些额外的代码。
# create 1st dataframe
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)

# create 2nd dataframe
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")
df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)

library(dplyr)

# left_join (not full_join) so that IDs present only in df2 cannot add rows
left_join(df1, df2, by = "ID") %>%
  # TRUE only for a loss inside the range; NA comparisons count as FALSE
  mutate(condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate,
                              FALSE)) %>%
  mutate(N = ifelse(condition, N, 0)) %>%
  # ifelse() drops the Date class, so convert the day counts back to Date
  mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                             origin = "1970-01-01")) %>%
  mutate(Amt = ifelse(condition, Amt, 0)) %>%
  group_by(ID, Location, startdate, enddate) %>% # for each df1 row
  # keep matched rows; a df1 row with no match at all keeps one zeroed row,
  # even when its ID has several out-of-range losses
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition)
#   ID Location  startdate    enddate N  Loss_Date  Amt
# 1 A1     012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2 A2     234B 2014-01-01 2014-08-31 0       <NA>    0
# 3 A3     012A 2015-10-01 2015-12-31 0       <NA>    0
# 4 A4     238C 2015-01-01 2015-12-31 2 2015-11-30  500
如果您了解上述代码的工作原理(逐步),您可以使用返回相同结果的更紧凑版本:
# Compact version: one output row per df1 row. left_join keeps df2-only IDs
# out of the result; unmatched df1 rows get N = 0, Loss_Date = NA, Amt = 0.
left_join(df1, df2, by = "ID") %>%
  mutate(
    # TRUE only for a loss inside the range; NA comparisons count as FALSE
    condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate, FALSE),
    N = ifelse(condition, N, 0),
    # ifelse() drops the Date class, so convert the day counts back to Date
    Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                        origin = "1970-01-01"),
    Amt = ifelse(condition, Amt, 0)
  ) %>%
  group_by(ID, Location, startdate, enddate) %>%
  # keep matched rows; a df1 row with no match at all keeps one zeroed row
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition)
sqldf 非常健壮且易于阅读。检查此代码:
# Range join expressed in SQL via sqldf: df1 (alias L) is left-joined to
# df2 (alias r). Rows pair up when the IDs are equal and r.Loss_Date lies
# between L.startdate and L.enddate (SQL BETWEEN includes both endpoints).
# LEFT JOIN keeps every df1 row; for rows without a match the r.* columns
# are NULL in SQL -- presumably returned to R as NA (verify), not as 0.
library(sqldf)
Output<-sqldf("
SELECT L.*, r.N, r.Loss_Date, r.Amt
FROM df1 as L
LEFT JOIN df2 as r
ON
L.ID=r.ID AND
r.Loss_Date BETWEEN L.startdate AND L.enddate
ORDER BY L.ID")
其中"L"表示df1(即df1为l),"r"表示df2(df2为r)。
在data.table(v1.9.7)的当前开发版本中,实现了非等值连接。有了它,我们可以做到:
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the next line fail with a less clear message.
library(data.table) # v1.9.7+
# setDT() converts df2 to a data.table by reference. For each df1 row the
# non-equi join looks up df2 rows with the same ID whose Loss_Date lies in
# [startdate, enddate]; x.Loss_Date is df2's own Loss_Date column.
setDT(df2)[df1, .(ID, Location, startdate, enddate, N, x.Loss_Date, Amt),
           on = .(ID, Loss_Date >= startdate, Loss_Date <= enddate)]
# ID Location startdate enddate N x.Loss_Date Amt
# 1: A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2: A2 234B 2014-01-01 2014-08-31 NA <NA> NA
# 3: A3 012A 2015-10-01 2015-12-31 NA <NA> NA
# 4: A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
我正在尝试使用多个条件合并 2 个数据框并使用了合并命令但无法获得成功的输出。
# Data frame df1: one row per ID with a location and a start/end date
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(
  c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01")
)
enddate <- as.Date(
  c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31")
)
df1 <- data.frame(
  ID = ID,
  Location = Location,
  startdate = startdate,
  enddate = enddate
)

# Data frame df2: loss records keyed by ID (note: ID is overwritten here)
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")
df2 <- data.frame(ID = ID, N = N, Loss_Date = Loss_Date, Amt = Amt)
我想以 ID 作为公共列合并这 2 个数据框,并且要求 df2 中的 Loss_Date 位于 df1 中的 startdate 和 enddate 之间(含两端)。你可以看到 df2 中的第二个条目没有被映射,因为其日期不在 df1
的范围内
#Required Output
# Required output: one row per df1 record; rows without a loss inside the
# date range carry N = 0, Amt = "0" and a missing Loss_Date.
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))
N <- c(2, 0, 0, 2)
# Use real missing values (NA), not the string "NA", so Loss_Date is a Date
# column and is.na() identifies the unmatched rows.
Loss_Date <- as.Date(c("2014-11-15", NA, NA, "2015-11-30"))
Amt <- c("2200", "0", "0", "500")
Output <- data.frame(ID, Location, startdate, enddate, N, Loss_Date, Amt)
我使用年份和 ID 创建了一个通用 ID,但得到了错误的映射。尝试了各种使用合并和匹配的方法,但都不起作用。我需要在超过 17 万条观测上运行这个合并。两个数据框的长度不相等。任何帮助都将不胜感激。
我使用包 dplyr
完成了合并,它非常快速且易于使用。
您应该将此 stringsAsFactors=F
添加到您的数据框定义中
# stringsAsFactors = FALSE keeps character columns as character; FALSE is
# spelled out because F is an ordinary variable that can be reassigned.
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)
df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)
因此您的字符输入不会更改为因子,它们不会给您带来不希望的结果
# Install dplyr only when it is missing instead of reinstalling on every run
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Join on ID, then keep only losses dated within [startdate, enddate].
# filter() also drops rows where the comparison is NA (IDs with no loss).
output <- full_join(df1, df2, by = "ID") %>%
  filter(Loss_Date >= startdate & Loss_Date <= enddate)
输出:
ID Location startdate enddate N Loss_Date Amt
1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
2 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
再次说明,如注释所指,如果要保留不符合条件的行,应使用另一个函数:
# Keep every df1 row: a loss inside [startdate, enddate] keeps its values,
# a df1 row with no such loss is kept once with N = 0, Loss_Date = NA, Amt = 0.
output2 <- left_join(df1, df2, by = "ID") %>%
  # TRUE only for a loss inside the range; NA comparisons count as FALSE
  mutate(condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate,
                              FALSE)) %>%
  mutate(N = ifelse(condition, N, 0)) %>%
  # ifelse() drops the Date class, so convert the day counts back to Date
  mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                             origin = "1970-01-01")) %>%
  mutate(Amt = ifelse(condition, Amt, 0)) %>%
  group_by(ID, Location, startdate, enddate) %>%
  # Keep matched rows; if a df1 row has no match at all, keep one zeroed row
  # (previously such a row was dropped when its ID had out-of-range losses).
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition) %>%
  as.data.frame()
首先创建一个符合条件的新列,然后根据该条件将其他列更改为0
或NA
。最后,取消选择新生成的列。 (注意 ifelse
将 Date
的 class 更改为 numeric
,因此需要 as.Date
)
ID Location startdate enddate N Loss_Date Amt
1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
2 A2 234B 2014-01-01 2014-08-31 0 <NA> 0
3 A3 012A 2015-10-01 2015-12-31 0 <NA> 0
4 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
我刚刚在@VincentBoned 的回答中添加了一些额外的代码。
# create 1st dataframe
ID <- c("A1", "A2", "A3", "A4")
Location <- c("012A", "234B", "012A", "238C")
startdate <- as.Date(c("2014-11-01", "2014-01-01", "2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31", "2014-08-31", "2015-12-31", "2015-12-31"))
df1 <- data.frame(ID, Location, startdate, enddate, stringsAsFactors = FALSE)

# create 2nd dataframe
ID <- c("A1", "A1", "A4")
N <- c(2, 1, 2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt <- c("2200", "1000", "500")
df2 <- data.frame(ID, N, Loss_Date, Amt, stringsAsFactors = FALSE)

library(dplyr)

# left_join (not full_join) so that IDs present only in df2 cannot add rows
left_join(df1, df2, by = "ID") %>%
  # TRUE only for a loss inside the range; NA comparisons count as FALSE
  mutate(condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate,
                              FALSE)) %>%
  mutate(N = ifelse(condition, N, 0)) %>%
  # ifelse() drops the Date class, so convert the day counts back to Date
  mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                             origin = "1970-01-01")) %>%
  mutate(Amt = ifelse(condition, Amt, 0)) %>%
  group_by(ID, Location, startdate, enddate) %>% # for each df1 row
  # keep matched rows; a df1 row with no match at all keeps one zeroed row,
  # even when its ID has several out-of-range losses
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition)
#   ID Location  startdate    enddate N  Loss_Date  Amt
# 1 A1     012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2 A2     234B 2014-01-01 2014-08-31 0       <NA>    0
# 3 A3     012A 2015-10-01 2015-12-31 0       <NA>    0
# 4 A4     238C 2015-01-01 2015-12-31 2 2015-11-30  500
如果您了解上述代码的工作原理(逐步),您可以使用返回相同结果的更紧凑版本:
# Compact version: one output row per df1 row. left_join keeps df2-only IDs
# out of the result; unmatched df1 rows get N = 0, Loss_Date = NA, Amt = 0.
left_join(df1, df2, by = "ID") %>%
  mutate(
    # TRUE only for a loss inside the range; NA comparisons count as FALSE
    condition = coalesce(Loss_Date >= startdate & Loss_Date <= enddate, FALSE),
    N = ifelse(condition, N, 0),
    # ifelse() drops the Date class, so convert the day counts back to Date
    Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),
                        origin = "1970-01-01"),
    Amt = ifelse(condition, Amt, 0)
  ) %>%
  group_by(ID, Location, startdate, enddate) %>%
  # keep matched rows; a df1 row with no match at all keeps one zeroed row
  filter(condition | (!any(condition) & row_number() == 1)) %>%
  ungroup() %>%
  select(-condition)
sqldf 非常健壮且易于阅读。检查此代码:
# Range join expressed in SQL via sqldf: df1 (alias L) is left-joined to
# df2 (alias r). Rows pair up when the IDs are equal and r.Loss_Date lies
# between L.startdate and L.enddate (SQL BETWEEN includes both endpoints).
# LEFT JOIN keeps every df1 row; for rows without a match the r.* columns
# are NULL in SQL -- presumably returned to R as NA (verify), not as 0.
library(sqldf)
Output<-sqldf("
SELECT L.*, r.N, r.Loss_Date, r.Amt
FROM df1 as L
LEFT JOIN df2 as r
ON
L.ID=r.ID AND
r.Loss_Date BETWEEN L.startdate AND L.enddate
ORDER BY L.ID")
其中"L"表示df1(即df1为l),"r"表示df2(df2为r)。
在data.table(v1.9.7)的当前开发版本中,实现了非等值连接。有了它,我们可以做到:
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the next line fail with a less clear message.
library(data.table) # v1.9.7+
# setDT() converts df2 to a data.table by reference. For each df1 row the
# non-equi join looks up df2 rows with the same ID whose Loss_Date lies in
# [startdate, enddate]; x.Loss_Date is df2's own Loss_Date column.
setDT(df2)[df1, .(ID, Location, startdate, enddate, N, x.Loss_Date, Amt),
           on = .(ID, Loss_Date >= startdate, Loss_Date <= enddate)]
# ID Location startdate enddate N x.Loss_Date Amt
# 1: A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2: A2 234B 2014-01-01 2014-08-31 NA <NA> NA
# 3: A3 012A 2015-10-01 2015-12-31 NA <NA> NA
# 4: A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500