根据日期间隔左连接列的子集

Left Join Subset of Column Based on Date Interval

我有两个数据框:df1 - 2 个不同的 MRI 阅读器对主题 1 的一系列 MRI 读取; df2 - 受试者 1 的类固醇使用列表

我正在尝试根据 df1$DOS 是否包含在 lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE) 中执行 df2$STATUSdf1 的左连接。如果 df1$DOS 没有可用的 df2$STATUS,我希望值为 NA

例如,最终的 df 应该看起来像这样,因为 df1$DOS 落在第一行 lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE):

输入示例

df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691, 
    15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 
    16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 
    17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 
    18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747, 
    15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678, 
    16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686, 
    17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113, 
    18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04", 
    "1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02", 
    "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06", 
    "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10", 
    "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14", 
    "Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17", 
    "Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16", 
    "Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21", 
    "Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25", 
    "1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01", 
    "Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", 
    "Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", 
    "Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", 
    "Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16", 
    "Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04", 
    "2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19", 
    "Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23", 
    "Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613, 
    2, 3.58064516129032, 5.41935483870968, 7.25806451612903, 
    9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871, 
    20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452, 
    35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097, 
    57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6, 
    69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667, 
    75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286, 
    82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355, 
    88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032, 
    5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333, 
    14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484, 
    27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548, 
    45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667, 
    62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333, 
    72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097, 
    79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032, 
    85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0, 
    28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824, 
    953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119, 
    2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610, 
    2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541, 
    632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 
    2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 
    2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06, 
    1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88, 
    500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39, 
    304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86, 
    2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53, 
    5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55, 
    2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37, 
    142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33, 
    86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43, 
    2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69, 
    5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84, 
    1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88, 
    348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24, 
    1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9, 
    62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476, 
    0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941, 
    -0.174163873550192, -0.280284394093129, -0.421406547932979, 
    -0.38843409819973, 0.435335932125659, 0.194106856225636, 
    0.169227241458381, 0.173927998165558, 0.0385519376289841, 
    -0.0559791332263242, -0.00734780628510708, -0.182118496314196, 
    0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291, 
    1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846, 
    10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446, 
    9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282, 
    23.5726092972811, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.189671158363209, -0.581315076852664, -0.0844910043270326, 
    -0.311592039800995, -0.561429500614295, 0.172777457361786, 
    -0.399357337068468, 2.63401920438957, -0.145816186556927, 
    0.171671752047535, 0.106632407258712, -0.171992933997109, 
    3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164, 
    22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384, 
    40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393, 
    76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
    ), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476, 
    0.042557111274871, 0.0373715809094456, 0.134758767176731, 
    0.024350860462092, -0.257699726906238, -0.465754909185487, 
    -0.690889288655772, -0.810958429060644, -0.728661840565261, 
    -0.77426416402965, -0.778967445489618, -0.77807880705709, 
    -0.803670510208505, -0.821540812345572, -0.822852095886254, 
    -0.855114005808661, -0.835168407820018, -0.552901079370584, 
    -0.783546100827951, -0.762581819758117, -0.566073518574711, 
    0.0552906497897611, 0.277612813732715, 0.60386774459231, 
    0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902, 
    0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399, 
    2.56022692791192, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.237581511664256, -0.680786873805041, -0.707757511431625, 
    -0.798817944561164, -0.911767485478747, -0.896522895963128, 
    -0.947003787520809, -0.807410746090711, -0.954731493126486, 
    -0.946960169238934, -0.949904403265555, -0.96251735644133, 
    -0.817290286935598, -0.93709516782134, -0.902236890888868, 
    -0.754639894734546, -0.120931686500869, -0.158072653518178, 
    -0.153012932820576, 0.768306956388988, 0.556678322440879, 
    1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562, 
    2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA, 
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L, 
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    .rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"), 
    ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO", 
    "PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
    ), label = "Route", format.sas = "$"), START_DT = structure(c(15455, 
    15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061, 
    18079, 18115, 18122, 18129, 18136, 18142), class = "Date"), 
    STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472, 
    15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135, 
    18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0, 
    0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30, 
    8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0, 
    1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE", 
    "INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA, 
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
    1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl", 
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"))

example  = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1", 
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"), 
    VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52, 
    SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52, 
    PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"), 
    STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"))

这是一个使用 data.table 包中的 foverlaps 的解决方案(而 dplyr 只是为了一些清洁)。基本上我们通过为开始和结束日期设置相同的值来将 DOS 变量视为一个间隔,然后我们通过重叠间隔加入。

另一种解决方案是通过 ID full_join 然后例如使用 lubridate 中的 %within% 但这不会保留没有匹配项的行。

请注意,我们不能在 df2 中包含那些具有开始或结束变量的 NA 日期的行。因此,这些在加入之前被删除。

library(data.table)
library(dplyr)

dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>% 
                       select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
                       filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>% 
  select(-i.start, -i.end)

Tidyverse解决方案

这里我们应用一个完整连接和过滤器来查找所有匹配项,然后使用原始 df1 数据集进行一个完整连接以保留完全没有匹配项的行。

library(lubridate)

df3 <- df1 %>% 
  full_join(df1 %>%
              full_join(df2 %>% 
                          select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
              filter(DOS %within% interval(START_DT, STOP_DT)))