基于引用中日期的子集 table

Question

我有 table1 如下。

   StudentId        Date1         Lunch
   23433            2014-08-26    Yes
   233989           2014-08-18    No
   909978           2014-08-06    No
   777492           2014-08-11    Yes
   3987387          2014-08-26    No

我还有一个table，table2如下

Id  StudentId        Date2        Result_Nm
1   777492           2012.06.10   0.1
2   777492           2013.12.06   2.0
3   777492           2014.08.30   0.6
4   23433            2011.08.26   3.0
5   23433            2015.04.06   3.0
6   233989           2011.05.14   0.003
7   233989           2014.09.14   0.05
8   909978           2004-09-12   0.2
9   909978           2005-05-10   0.23
10  909978           2015-01-02   2.4
11  3987387          2014-10-06   3.5
12  3987387          2014-08-26   1.17

我只想保留来自 table2 数据集的观察结果，其中 Date2 值小于每个 StudentId 的 Date1 值。换句话说，它应该包含这些行。

  Id  StudentId        Date2         Result_Nm
  1   777492           2012.06.10    0.1
  2   777492           2013.12.06    2.0
  4   23433            2011.08.26    3.0
  6   233989           2014.09.14    0.05
  8   909978           2004-09-12    0.2
  9   909978           2005-05-10    0.23
  12  3987387          2014-08-26    1.17

观察3被排除，因为StudentId 777492的Date1值是2014-08-11，这个值小于2014.08.30，类似观察5,7,10,11等等。我以前使用过子集，但这更具挑战性，需要帮助。

Answer 1

我们可以使用 lubridate 中的 ymd 将 'Date' 列更改为 'Date' class。它可以采用多种格式（.、-）。通过 'StudentId' 连接两个数据集 (left_join)，使用 filter 和 select 删除行，特定列

library(lubridate) 
library(dplyr)
df2$Date2 <- ymd(df2$Date2)
df1$Date1 <- ymd(df1$Date1)

left_join(df2, df1, by='StudentId') %>% 
                     filter(Date2 <=Date1) %>% 
                     select(1:4)
#    Id StudentId      Date2 Result_Nm
#1  1    777492 2012-06-10     0.100
#2  2    777492 2013-12-06     2.000
#3  4     23433 2011-08-26     3.000
#4  6    233989 2011-05-14     0.003
#5  8    909978 2004-09-12     0.200
#6  9    909978 2005-05-10     0.230
#7 12   3987387 2014-08-26     1.170

或者我们可以使用data.table。这里我们将'df2'从'data.frame'转换为'data.table'（setDT），设置key为'StudentId'（setkey(..., StudentId)），join 与 'df1' ('StudentId', 'Date1') 的子集，根据 'key' 变量分组的条件 (.SD[Date2 <= Date1]) 过滤输出数据集。关于 .EACHI 的更多信息是 here

library(data.table)
setkey(setDT(df2),StudentId)[df1[1:2], .SD[Date2<=Date1],by=.EACHI][order(Id)]
#   StudentId Id      Date2 Result_Nm
#1:    777492  1 2012-06-10     0.100
#2:    777492  2 2013-12-06     2.000
#3:     23433  4 2011-08-26     3.000
#4:    233989  6 2011-05-14     0.003
#5:    909978  8 2004-09-12     0.200
#6:    909978  9 2005-05-10     0.230
#7:   3987387 12 2014-08-26     1.170

注意： 'Dates' 在 join.

之前已更改为 'Date' class

数据

df1 <-  structure(list(StudentId = c(23433L, 233989L, 909978L,
777492L, 
3987387L), Date1 = c("2014-08-26", "2014-08-18", "2014-08-06", 
"2014-08-11", "2014-08-26"), Lunch = c("Yes", "No", "No", "Yes", 
"No")), .Names = c("StudentId", "Date1", "Lunch"), 
class = "data.frame", row.names = c(NA, -5L))

df2 <-  structure(list(Id = 1:12, StudentId = c(777492L, 777492L, 
777492L, 
23433L, 23433L, 233989L, 233989L, 909978L, 909978L, 909978L, 
3987387L, 3987387L), Date2 = c("2012.06.10", "2013.12.06", 
"2014.08.30", 
"2011.08.26", "2015.04.06", "2011.05.14", "2014.09.14", "2004-09-12", 
"2005-05-10", "2015-01-02", "2014-10-06", "2014-08-26"), 
Result_Nm = c(0.1, 
2, 0.6, 3, 3, 0.003, 0.05, 0.2, 0.23, 2.4, 3.5, 1.17)),
.Names = c("Id", 
"StudentId", "Date2", "Result_Nm"), class = "data.frame", 
row.names = c(NA, -12L))

Answer 2

这是我的解决方案。

d1 <- read.table("d1.txt", header=T)
d2 <- read.table("d2.txt", header=T)
d3 <- merge(d1,d2, by="StudentId")
d3$Date1 <- strptime(d3$Date1, format="%Y-%m-%d")
library(stringr)
d3$Date2 <-str_c(str_sub(d3$Date2, 1L,4L),"-",str_sub(d3$Date2,6L,7L), "-", str_sub(d3$Date2,9L,10L))
d3$Date2 <- strptime(d3$Date2, format="%Y-%m-%d")
d3$mark <- d3$Date2-d3$Date1
d3 <- d3[d3$mark<1,]
d3 <- d3[,c(1,4,5,6)]

基于引用中日期的子集 table

subset based on date in a reference table

r

subset

plyr

data.table

数据