根据日期间隔左连接列的子集

Question

我有两个数据框：df1 —— 两位不同的 MRI 阅片者对受试者 1 的一系列 MRI 读片结果；df2 —— 受试者 1 的类固醇用药记录。

我想根据 df1$DOS 是否落在 lubridate::interval(start = df2$START_DT, end = df2$STOP_DT) 区间内，把 df2$STATUS 左连接到 df1。如果某个 df1$DOS 没有对应的 df2$STATUS，我希望该值为 NA。

例如，最终的数据框中应包含如下这样的一行，因为 df1$DOS 落在 df2 第一行的 lubridate::interval(start = df2$START_DT, end = df2$STOP_DT) 区间内：

输入示例

# df1: MRI reads for subject 1 - 70 rows (row.names = c(NA, -70L)), stored as
# a dplyr grouped_df grouped by SUBJID and READER (rows 1:35 carry READER
# level "1", rows 36:70 carry READER level "4").
# DOS and CLINDET_DT are Date vectors written as day counts (class = "Date").
df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691, 
    15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 
    16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 
    17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 
    18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747, 
    15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678, 
    16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686, 
    17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113, 
    18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04", 
    "1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02", 
    "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06", 
    "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10", 
    "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14", 
    "Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17", 
    "Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16", 
    "Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21", 
    "Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25", 
    "1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01", 
    "Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", 
    "Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", 
    "Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", 
    "Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16", 
    "Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04", 
    "2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19", 
    "Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23", 
    "Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613, 
    2, 3.58064516129032, 5.41935483870968, 7.25806451612903, 
    9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871, 
    20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452, 
    35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097, 
    57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6, 
    69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667, 
    75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286, 
    82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355, 
    88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032, 
    5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333, 
    14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484, 
    27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548, 
    45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667, 
    62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333, 
    72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097, 
    79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032, 
    85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0, 
    28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824, 
    953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119, 
    2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610, 
    2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541, 
    632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 
    2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 
    2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06, 
    1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88, 
    500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39, 
    304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86, 
    2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53, 
    5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55, 
    2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37, 
    142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33, 
    86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43, 
    2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69, 
    5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84, 
    1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88, 
    348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24, 
    1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9, 
    62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476, 
    0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941, 
    -0.174163873550192, -0.280284394093129, -0.421406547932979, 
    -0.38843409819973, 0.435335932125659, 0.194106856225636, 
    0.169227241458381, 0.173927998165558, 0.0385519376289841, 
    -0.0559791332263242, -0.00734780628510708, -0.182118496314196, 
    0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291, 
    1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846, 
    10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446, 
    9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282, 
    23.5726092972811, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.189671158363209, -0.581315076852664, -0.0844910043270326, 
    -0.311592039800995, -0.561429500614295, 0.172777457361786, 
    -0.399357337068468, 2.63401920438957, -0.145816186556927, 
    0.171671752047535, 0.106632407258712, -0.171992933997109, 
    3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164, 
    22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384, 
    40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393, 
    76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
    ), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476, 
    0.042557111274871, 0.0373715809094456, 0.134758767176731, 
    0.024350860462092, -0.257699726906238, -0.465754909185487, 
    -0.690889288655772, -0.810958429060644, -0.728661840565261, 
    -0.77426416402965, -0.778967445489618, -0.77807880705709, 
    -0.803670510208505, -0.821540812345572, -0.822852095886254, 
    -0.855114005808661, -0.835168407820018, -0.552901079370584, 
    -0.783546100827951, -0.762581819758117, -0.566073518574711, 
    0.0552906497897611, 0.277612813732715, 0.60386774459231, 
    0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902, 
    0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399, 
    2.56022692791192, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.237581511664256, -0.680786873805041, -0.707757511431625, 
    -0.798817944561164, -0.911767485478747, -0.896522895963128, 
    -0.947003787520809, -0.807410746090711, -0.954731493126486, 
    -0.946960169238934, -0.949904403265555, -0.96251735644133, 
    -0.817290286935598, -0.93709516782134, -0.902236890888868, 
    -0.754639894734546, -0.120931686500869, -0.158072653518178, 
    -0.153012932820576, 0.768306956388988, 0.556678322440879, 
    1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562, 
    2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA, 
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L, 
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    .rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

# df2: steroid (dexamethasone) records for subject 1 - 16 rows
# (row.names = c(NA, -16L)), stored as a dplyr grouped_df grouped by SUBJID.
# START_DT / STOP_DT are Date vectors written as day counts; the last record
# has STOP_DT = NA (and DAYS_DUR = NA).
# Note: df2 has its own DAYS_DUR column, the same name as a column in df1.
df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"), 
    ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO", 
    "PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
    ), label = "Route", format.sas = "$"), START_DT = structure(c(15455, 
    15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061, 
    18079, 18115, 18122, 18129, 18136, 18142), class = "Date"), 
    STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472, 
    15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135, 
    18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0, 
    0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30, 
    8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0, 
    1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE", 
    "INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA, 
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
    1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl", 
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"))

# example: the one-row result the question expects - the columns of the first
# df1 row (READER level "1", DOS 15469) plus a STATUS column from df2.
# NOTE(review): DOS 15469 lies outside df2 row 1 (15455..15468, "STABLE") and
# inside df2 row 2 (15469..15469, "INCREASE") - confirm the expected STATUS.
example  = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1", 
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"), 
    VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52, 
    SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52, 
    PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"), 
    STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"))

Answer 1

这是一个使用 data.table 包中 foverlaps 的解决方案（dplyr 仅用于一些数据整理）。基本思路是：把 DOS 的开始日期和结束日期设为同一个值，从而将 DOS 视为一个区间，然后按区间重叠进行连接。

另一种解决方案是先按 ID 做 full_join，然后例如用 lubridate 中的 %within% 进行筛选，但这样不会保留没有匹配项的行。

请注意，df2 中开始日期或结束日期为 NA 的行不能参与连接，因此在连接之前会先将这些行删除。

library(data.table)
library(dplyr)

# Turn every MRI read into a zero-length interval [DOS, DOS] so that it can
# take part in an interval-overlap join.
dt1 <- df1 %>%
  mutate(start = DOS, end = DOS) %>%
  as.data.table()

# Steroid intervals: keep only the join key, the interval bounds and STATUS,
# and drop records whose start or end date is missing.
dt2 <- df2 %>%
  select(SUBJID, start = START_DT, end = STOP_DT, STATUS) %>%
  filter(!is.na(start), !is.na(end)) %>%
  as.data.table()

# Key both tables on subject + interval bounds (this sorts them by reference).
setkeyv(dt1, c("SUBJID", "start", "end"))
setkeyv(dt2, c("SUBJID", "start", "end"))

# Match each read interval that lies within a steroid interval, then remove
# the helper interval columns that came from dt1 (prefixed "i." in the result).
df3 <- select(foverlaps(dt1, dt2, type = "within"), -i.start, -i.end)

`Tidyverse`解决方案

这里我们先做一次全连接并进行过滤，以找出所有匹配项；然后再与原始的 df1 数据集做一次全连接，以保留完全没有匹配项的行。

library(lubridate)

df3 <- local({
  # Only the join key, the interval bounds and the status are needed from df2.
  steroid_spans <- select(df2, SUBJID, START_DT, STOP_DT, STATUS)

  # Pair every read with every steroid record of the same subject, then keep
  # the pairs whose read date falls inside the steroid interval.
  paired <- full_join(df1, steroid_spans, by = "SUBJID")
  matched <- filter(paired, DOS %within% interval(START_DT, STOP_DT))

  # Join the matches back onto the complete read list (natural join on all
  # shared columns) so that reads without any match are kept.
  full_join(df1, matched)
})

根据日期间隔左连接列的子集

Left Join Subset of Column Based on Date Interval

r

left-join

lubridate

`Tidyverse`解决方案

根据日期间隔左连接列的子集

Left Join Subset of Column Based on Date Interval

r

left-join

lubridate

Tidyverse解决方案

`Tidyverse`解决方案