通过 ID 号确定重叠日期范围的存在和范围 - 两个数据框
Determine presence and extent of overlapping date ranges by ID number - two data frames
我有两个数据框如下。它们的长度不等:
library(lubridate)
id <- c(1, 2, 2, 2, 2, 3, 4, 4, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9,
10, 10, 10, 11, 11, 12, 13, 14, 15, 15, 5451396, 5451396, 5451396, 5451396, 5451396)
admDt <- ymd(c("2000-02-24", "2000-04-30", "2000-06-06", "2001-01-29", "2004-06-10", "2001-05-21",
"2000-01-25", "2000-04-18", "2000-01-14", "1991-10-06", "1992-02-25", "2000-05-17",
"2003-06-06", "2009-02-16", "2000-01-23", "2000-03-10", "2000-04-05", "2000-06-16",
"2000-07-04", "2000-07-27", "2001-01-19", "2002-08-16", "2002-09-19", "2004-04-17",
"2005-08-02", "2005-09-21", "2006-07-10", "2000-02-24", "2000-05-05", "2000-08-29",
"2001-01-24", "2000-01-27", "2000-03-09", "2000-04-15", "2000-03-20", "2002-11-13",
"2000-06-28", "2000-07-02", "2000-06-13", "1999-12-27", "2008-09-10", "2000-04-09",
"2000-06-01", "2002-11-25", "2006-08-04", "2006-10-07"))
sepDt <- ymd(c("2000-02-25", "2000-05-25", "2000-06-06", "2001-02-15", "2004-07-12", "2001-06-01",
"2000-01-31", "2000-04-20", "2000-01-31", "1991-11-07", "1992-03-26", "2000-05-31",
"2003-06-17", "2009-02-23", "2000-03-06", "2000-03-17", "2000-04-06", "2000-06-28",
"2000-07-17", "2000-07-31", "2002-04-19", "2002-09-11", "2003-05-06", "2004-05-03",
"2005-08-31", "2006-05-29", "2009-06-19", "2000-03-09", "2000-05-06", "2000-09-12",
"2001-01-24", "2000-02-15", "2000-03-17", "2000-04-16", "2000-04-20", "2002-12-05",
"2000-07-27", "2000-08-15", "2000-06-22", "2000-02-12", "2008-09-17", "2000-05-26",
"2000-08-29", "2003-02-24", "2006-09-22", "2006-11-10"))
adm <- data.frame(id, admDt, sepDt)
id <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5451396)
birthDt <- ymd(c("1971-07-22", "1982-08-09", "1976-01-30", "1972-02-03", "1958-05-26", "1979-05-24",
"1971-11-03", "1980-02-05", "1978-06-08", "1969-10-14", "1962-01-01", "1977-03-09",
"1952-01-24", "1974-12-16", "1956-05-05", "1963-07-16"))
dxDt <- ymd(c("2000-02-24", "2000-04-30", "2000-03-03", "2000-01-31", "2000-06-20", "2000-12-13",
"2000-05-14", "2000-01-23", "2000-03-09", "2000-02-15", "2000-05-01", "2000-06-30",
"2000-08-15", "2000-06-22", "2000-01-27", "2000-06-01"))
admPreDx <- c("No", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "Yes","Yes", "Yes", "Yes",
"Yes", "Yes", "Yes")
admPreDxNbr <- c(0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)
admPreDxDur <- c(0, 0, 0, 6, 0, 0, 0, 0, 14, 19, 20, 2, 31, 9, 31, 25)
admPostDx <- c("Yes", "Yes", "No", "No", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes", "No",
"No", "Yes", "Yes")
admPostDxNbr <- c(1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 3)
admPostDxDur <- c(1, 25, 0, 0, 0, 0, 14, 31, 0, 6, 0, 27, 0, 0, 16, 31)
admDx <- data.frame(id, birthDt, dxDt, admPreDx, admPreDxNbr, admPreDxDur, admPostDx, admPostDxNbr,
admPostDxDur)
> head(adm)
id admDt sepDt
1 1 2000-02-24 2000-02-25
2 2 2000-04-30 2000-05-25
3 2 2000-06-06 2000-06-06
4 2 2001-01-29 2001-02-15
5 2 2004-06-10 2004-07-12
6 3 2001-05-21 2001-06-01
> head(admDx)
id birthDt dxDt admPreDx admPreDxNbr admPreDxDur admPostDx admPostDxNbr admPostDxDur
1 1 1971-07-22 2000-02-24 No 0 0 Yes 1 1
2 2 1982-08-09 2000-04-30 No 0 0 Yes 1 25
3 3 1976-01-30 2000-03-03 No 0 0 No 0 0
4 4 1972-02-03 2000-01-31 Yes 1 6 No 0 0
5 5 1958-05-26 2000-06-20 No 0 0 No 0 0
6 6 1979-05-24 2000-12-13 No 0 0 No 0 0
实际数据集的范围从 10,000 到 1,000,000 多行。
adm
中的每一行都指的是一次离散的住院治疗。注:id
为患者身份证号,admDt
、sepDt
分别为入院、出院日期。有些病人有多次入院。
admDx
中每一行代表一个病人:id
是病人的身份证号(与adm
中提供的一致),而birthDt
和dxDt
分别是患者的出生日期和诊断日期。
我正在进行一些纵向/时间序列分析,并想确定患者在诊断前和 post 的不同时间段内是否住院。为了简洁起见,这个问题是关于诊断前后一个月的问题。理想情况下,我想:
- 创建一个二分变量 ("Yes" / "No"),表示给定患者是否在该时间段内住院(即,我不担心他们是否在发病前入院时间段的时间或者如果他们在时间段的偏移之后出院)
- 计算每个患者在时间段内住院的次数
- 计算每个患者在该时间段内住院的持续时间(天数)
我在几天内审查了一些 post(例如,R Time periods overlapping, , );然而,none 似乎结合了我感兴趣的三个方面(计算重叠日期之间的时间;多个数据帧;"group" [或个人])。
我是 R 的新手,对循环和更高级的公式几乎没有经验。似乎可以使用 "DescTools"
包中的 foverlaps
、lubridate
或 %overlaps%
;但是,我不确定如何构造相关公式。
如有任何帮助,我们将不胜感激!
编辑#1:
虽然@sirallen 的建议在所提供示例中的特定时间段内有效,但 sum(pmin(dxDt, sepDt) - pmax(admDt, dxDt)), by = "id"
在我的真实数据集中返回了不准确的值(例如,一天内多次入院的患者[“2000-01-25 " - "2000-01-26"] 据报道在医院度过了零天。这对我来说似乎很奇怪,因为代码似乎被用来回答类似的例子。这个问题是否与我这些患者有几个重叠的日期范围? 此外,正如@sirallen 所指出的,该代码没有突出显示患者在该时间段内有一次或多次入院的情况。
下面的代码通过确定 a) 患者是否在医院度过时间和 b) 入院人数,为我的问题的前两部分提供了更直接的途径:
library(data.table)
setDT(adm)
setDT(admDx)[, (4:9) := NULL]
#Period bounds
admDx[, `:=`(dxDtN1 = dxDt %m-% months(1), dxDtP1 = dxDt %m+% months(1))]
#Hospitalised in the month preceding diagnosis
admDx <- adm[admDx, on = .(id, admDt < dxDt, sepDt > dxDtN1), .N, by = .EACHI]
admDx[, `:=` (admPreDx = factor(ifelse(N > 0, "Yes", "No")))]
但是,pmin / pmax 代码仍然不起作用,返回负值:
admDx[, `:=` (birthDt = birthDt, dxDt = dxDt, dxDtN1 = dxDt %m-% months(1), dxDtP1 = dxDt %m+% months(1))]
admDx[, `:=` (admPreDxDur=as.numeric(sum(pmin(dxDt, adm$sepDt) - pmax(dxDtN1, adm$admDt)))), by = "id"]
admDx <- select(admDx, admPreDx, N, admPreDxDur)
> head(admDx)
admPreDx N admPreDxDur
1: No 0 -28573
2: No 0 -27160
3: No 0 -28366
4: Yes 1 -29357
5: No 0 -26701
6: No 0 -28044
编辑#2
在测试其他情况后,似乎问题 re: pmin / pmax 可能与 >
与 >=
的使用有关:当使用 >
时,正确的 Dur
返回值;但是,当使用 >=
时,Dur
returns 值为 0。
如何调整此代码,使我能够计算截至诊断日期(包括诊断日期)的天数?
我们可以使用 data.table
(>=v1.9.8) 中的 non-equi joins 来做到这一点:
library(data.table)
setDT(adm)
setDT(admDx)[, (4:9):= NULL]
# period bounds
admDx[, `:=`(dxDtLo=dxDt-31, dxDtHi=dxDt+31)]
# hospitalized pre-dxnosis?
admDx = adm[, .(id, admDt, sepDt, dxDt=admDt, dxDtLo=sepDt)][admDx,
on=.(id, dxDt < dxDt, dxDtLo > dxDtLo)]
admDx[, admPreDx:= as.numeric(!is.na(admDt))]
admDx[, `:=`(admPreDxNbr=sum(admPreDx), admPreDxDur=as.numeric(
sum(pmin(dxDt,sepDt) - pmax(admDt,dxDtLo)))), by='id']
admDx[, c('admDt','sepDt'):= NULL]
# hospitalized post-dxnosis?
admDx = adm[, .(id, admDt, sepDt, dxDtHi=admDt, dxDt=sepDt)][admDx,
on=.(id, dxDtHi < dxDtHi, dxDt > dxDt)]
admDx[, admPostDx:= as.numeric(!is.na(admDt))]
admDx[, `:=`(admPostDxNbr=sum(admPostDx), admPostDxDur=as.numeric(
sum(pmin(sepDt,dxDtHi) - pmax(dxDt,admDt)))), by='id']
admDx[, c('admDt','sepDt'):= NULL]
admDx[is.na(admDx)] = 0
admDx = unique(admDx)[, c('dxDtLo','dxDtHi'):= NULL]
> admDx
# id dxDt birthDt admPreDx admPreDxNbr admPreDxDur admPostDx admPostDxNbr admPostDxDur
# 1: 1 2000-02-24 1971-07-22 0 0 0 1 1 1
# 2: 2 2000-04-30 1982-08-09 0 0 0 1 1 25
# 3: 3 2000-03-03 1976-01-30 0 0 0 0 0 0
# 4: 4 2000-01-31 1972-02-03 1 1 6 0 0 0
# 5: 5 2000-06-20 1958-05-26 0 0 0 0 0 0
# 6: 6 2000-12-13 1979-05-24 0 0 0 0 0 0
# 7: 7 2000-05-14 1971-11-03 0 0 0 1 1 14
# 8: 8 2000-01-23 1980-02-05 0 0 0 1 1 31
# 9: 9 2000-03-09 1978-06-08 1 1 14 0 0 0
# 10: 10 2000-02-15 1969-10-14 1 1 19 1 1 8
# 11: 11 2000-05-01 1962-01-01 1 1 20 0 0 0
# 12: 12 2000-06-30 1977-03-09 1 1 2 1 1 27
# 13: 13 2000-08-15 1952-01-24 1 1 31 0 0 0
# 14: 14 2000-06-22 1974-12-16 1 1 9 0 0 0
# 15: 15 2000-01-27 1956-05-05 1 1 31 1 1 16
# 16: 5451396 2000-06-01 1963-07-16 1 1 25 1 1 31
我有两个数据框如下。它们的长度不等:
library(lubridate)
id <- c(1, 2, 2, 2, 2, 3, 4, 4, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9,
10, 10, 10, 11, 11, 12, 13, 14, 15, 15, 5451396, 5451396, 5451396, 5451396, 5451396)
admDt <- ymd(c("2000-02-24", "2000-04-30", "2000-06-06", "2001-01-29", "2004-06-10", "2001-05-21",
"2000-01-25", "2000-04-18", "2000-01-14", "1991-10-06", "1992-02-25", "2000-05-17",
"2003-06-06", "2009-02-16", "2000-01-23", "2000-03-10", "2000-04-05", "2000-06-16",
"2000-07-04", "2000-07-27", "2001-01-19", "2002-08-16", "2002-09-19", "2004-04-17",
"2005-08-02", "2005-09-21", "2006-07-10", "2000-02-24", "2000-05-05", "2000-08-29",
"2001-01-24", "2000-01-27", "2000-03-09", "2000-04-15", "2000-03-20", "2002-11-13",
"2000-06-28", "2000-07-02", "2000-06-13", "1999-12-27", "2008-09-10", "2000-04-09",
"2000-06-01", "2002-11-25", "2006-08-04", "2006-10-07"))
sepDt <- ymd(c("2000-02-25", "2000-05-25", "2000-06-06", "2001-02-15", "2004-07-12", "2001-06-01",
"2000-01-31", "2000-04-20", "2000-01-31", "1991-11-07", "1992-03-26", "2000-05-31",
"2003-06-17", "2009-02-23", "2000-03-06", "2000-03-17", "2000-04-06", "2000-06-28",
"2000-07-17", "2000-07-31", "2002-04-19", "2002-09-11", "2003-05-06", "2004-05-03",
"2005-08-31", "2006-05-29", "2009-06-19", "2000-03-09", "2000-05-06", "2000-09-12",
"2001-01-24", "2000-02-15", "2000-03-17", "2000-04-16", "2000-04-20", "2002-12-05",
"2000-07-27", "2000-08-15", "2000-06-22", "2000-02-12", "2008-09-17", "2000-05-26",
"2000-08-29", "2003-02-24", "2006-09-22", "2006-11-10"))
adm <- data.frame(id, admDt, sepDt)
id <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5451396)
birthDt <- ymd(c("1971-07-22", "1982-08-09", "1976-01-30", "1972-02-03", "1958-05-26", "1979-05-24",
"1971-11-03", "1980-02-05", "1978-06-08", "1969-10-14", "1962-01-01", "1977-03-09",
"1952-01-24", "1974-12-16", "1956-05-05", "1963-07-16"))
dxDt <- ymd(c("2000-02-24", "2000-04-30", "2000-03-03", "2000-01-31", "2000-06-20", "2000-12-13",
"2000-05-14", "2000-01-23", "2000-03-09", "2000-02-15", "2000-05-01", "2000-06-30",
"2000-08-15", "2000-06-22", "2000-01-27", "2000-06-01"))
admPreDx <- c("No", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "Yes","Yes", "Yes", "Yes",
"Yes", "Yes", "Yes")
admPreDxNbr <- c(0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)
admPreDxDur <- c(0, 0, 0, 6, 0, 0, 0, 0, 14, 19, 20, 2, 31, 9, 31, 25)
admPostDx <- c("Yes", "Yes", "No", "No", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes", "No",
"No", "Yes", "Yes")
admPostDxNbr <- c(1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 3)
admPostDxDur <- c(1, 25, 0, 0, 0, 0, 14, 31, 0, 6, 0, 27, 0, 0, 16, 31)
admDx <- data.frame(id, birthDt, dxDt, admPreDx, admPreDxNbr, admPreDxDur, admPostDx, admPostDxNbr,
admPostDxDur)
> head(adm)
id admDt sepDt
1 1 2000-02-24 2000-02-25
2 2 2000-04-30 2000-05-25
3 2 2000-06-06 2000-06-06
4 2 2001-01-29 2001-02-15
5 2 2004-06-10 2004-07-12
6 3 2001-05-21 2001-06-01
> head(admDx)
id birthDt dxDt admPreDx admPreDxNbr admPreDxDur admPostDx admPostDxNbr admPostDxDur
1 1 1971-07-22 2000-02-24 No 0 0 Yes 1 1
2 2 1982-08-09 2000-04-30 No 0 0 Yes 1 25
3 3 1976-01-30 2000-03-03 No 0 0 No 0 0
4 4 1972-02-03 2000-01-31 Yes 1 6 No 0 0
5 5 1958-05-26 2000-06-20 No 0 0 No 0 0
6 6 1979-05-24 2000-12-13 No 0 0 No 0 0
实际数据集的范围从 10,000 到 1,000,000 多行。
adm
中的每一行都指的是一次离散的住院治疗。注:id
为患者身份证号,admDt
、sepDt
分别为入院、出院日期。有些病人有多次入院。
admDx
中每一行代表一个病人:id
是病人的身份证号(与adm
中提供的一致),而birthDt
和dxDt
分别是患者的出生日期和诊断日期。
我正在进行一些纵向/时间序列分析,并想确定患者在诊断前和 post 的不同时间段内是否住院。为了简洁起见,这个问题是关于诊断前后一个月的问题。理想情况下,我想:
- 创建一个二分变量 ("Yes" / "No"),表示给定患者是否在该时间段内住院(即,我不担心他们是否在发病前入院时间段的时间或者如果他们在时间段的偏移之后出院)
- 计算每个患者在时间段内住院的次数
- 计算每个患者在该时间段内住院的持续时间(天数)
我在几天内审查了一些 post(例如,R Time periods overlapping,
我是 R 的新手,对循环和更高级的公式几乎没有经验。似乎可以使用 "DescTools"
包中的 foverlaps
、lubridate
或 %overlaps%
;但是,我不确定如何构造相关公式。
如有任何帮助,我们将不胜感激!
编辑#1:
虽然@sirallen 的建议在所提供示例中的特定时间段内有效,但 sum(pmin(dxDt, sepDt) - pmax(admDt, dxDt)), by = "id"
在我的真实数据集中返回了不准确的值(例如,一天内多次入院的患者[“2000-01-25 " - "2000-01-26"] 据报道在医院度过了零天。这对我来说似乎很奇怪,因为代码似乎被用来回答类似的例子。这个问题是否与我这些患者有几个重叠的日期范围? 此外,正如@sirallen 所指出的,该代码没有突出显示患者在该时间段内有一次或多次入院的情况。
下面的代码通过确定 a) 患者是否在医院度过时间和 b) 入院人数,为我的问题的前两部分提供了更直接的途径:
library(data.table)
setDT(adm)
setDT(admDx)[, (4:9) := NULL]
#Period bounds
admDx[, `:=`(dxDtN1 = dxDt %m-% months(1), dxDtP1 = dxDt %m+% months(1))]
#Hospitalised in the month preceding diagnosis
admDx <- adm[admDx, on = .(id, admDt < dxDt, sepDt > dxDtN1), .N, by = .EACHI]
admDx[, `:=` (admPreDx = factor(ifelse(N > 0, "Yes", "No")))]
但是,pmin / pmax 代码仍然不起作用,返回负值:
admDx[, `:=` (birthDt = birthDt, dxDt = dxDt, dxDtN1 = dxDt %m-% months(1), dxDtP1 = dxDt %m+% months(1))]
admDx[, `:=` (admPreDxDur=as.numeric(sum(pmin(dxDt, adm$sepDt) - pmax(dxDtN1, adm$admDt)))), by = "id"]
admDx <- select(admDx, admPreDx, N, admPreDxDur)
> head(admDx)
admPreDx N admPreDxDur
1: No 0 -28573
2: No 0 -27160
3: No 0 -28366
4: Yes 1 -29357
5: No 0 -26701
6: No 0 -28044
编辑#2
在测试其他情况后,似乎问题 re: pmin / pmax 可能与 >
与 >=
的使用有关:当使用 >
时,正确的 Dur
返回值;但是,当使用 >=
时,Dur
returns 值为 0。
如何调整此代码,使我能够计算截至诊断日期(包括诊断日期)的天数?
我们可以使用 data.table
(>=v1.9.8) 中的 non-equi joins 来做到这一点:
library(data.table)
setDT(adm)
setDT(admDx)[, (4:9):= NULL]
# period bounds
admDx[, `:=`(dxDtLo=dxDt-31, dxDtHi=dxDt+31)]
# hospitalized pre-dxnosis?
admDx = adm[, .(id, admDt, sepDt, dxDt=admDt, dxDtLo=sepDt)][admDx,
on=.(id, dxDt < dxDt, dxDtLo > dxDtLo)]
admDx[, admPreDx:= as.numeric(!is.na(admDt))]
admDx[, `:=`(admPreDxNbr=sum(admPreDx), admPreDxDur=as.numeric(
sum(pmin(dxDt,sepDt) - pmax(admDt,dxDtLo)))), by='id']
admDx[, c('admDt','sepDt'):= NULL]
# hospitalized post-dxnosis?
admDx = adm[, .(id, admDt, sepDt, dxDtHi=admDt, dxDt=sepDt)][admDx,
on=.(id, dxDtHi < dxDtHi, dxDt > dxDt)]
admDx[, admPostDx:= as.numeric(!is.na(admDt))]
admDx[, `:=`(admPostDxNbr=sum(admPostDx), admPostDxDur=as.numeric(
sum(pmin(sepDt,dxDtHi) - pmax(dxDt,admDt)))), by='id']
admDx[, c('admDt','sepDt'):= NULL]
admDx[is.na(admDx)] = 0
admDx = unique(admDx)[, c('dxDtLo','dxDtHi'):= NULL]
> admDx
# id dxDt birthDt admPreDx admPreDxNbr admPreDxDur admPostDx admPostDxNbr admPostDxDur
# 1: 1 2000-02-24 1971-07-22 0 0 0 1 1 1
# 2: 2 2000-04-30 1982-08-09 0 0 0 1 1 25
# 3: 3 2000-03-03 1976-01-30 0 0 0 0 0 0
# 4: 4 2000-01-31 1972-02-03 1 1 6 0 0 0
# 5: 5 2000-06-20 1958-05-26 0 0 0 0 0 0
# 6: 6 2000-12-13 1979-05-24 0 0 0 0 0 0
# 7: 7 2000-05-14 1971-11-03 0 0 0 1 1 14
# 8: 8 2000-01-23 1980-02-05 0 0 0 1 1 31
# 9: 9 2000-03-09 1978-06-08 1 1 14 0 0 0
# 10: 10 2000-02-15 1969-10-14 1 1 19 1 1 8
# 11: 11 2000-05-01 1962-01-01 1 1 20 0 0 0
# 12: 12 2000-06-30 1977-03-09 1 1 2 1 1 27
# 13: 13 2000-08-15 1952-01-24 1 1 31 0 0 0
# 14: 14 2000-06-22 1974-12-16 1 1 9 0 0 0
# 15: 15 2000-01-27 1956-05-05 1 1 31 1 1 16
# 16: 5451396 2000-06-01 1963-07-16 1 1 25 1 1 31