根据前几行中的数据值向数据集中的患者添加一行

Add a row to a patient in dataset based on value of data in preceding rows

我有一个这样构造的数据集:

Patient ID    Visit Date         Dead     Death Date    Sex    State
101           Feb/14             1          Jan/15      M      2
101           June/14            1          Jan/15      M      3 
101           December/14        1          Jan/15      M      2
102           Jan/14             0          N/A         M      1
102           April/14           0          N/A         M      1 

如果患者已经死亡,所有就诊都将标有 "Dead" 代码和死亡日期。

如果死亡代码 = 1,

我需要为该患者(例如患者 101)新增一行,作为其最后一次就诊,

其中 "Visit Date" 列填入死亡日期,

并且 "State" 变量为 5(我数据集中的死亡状态代码)。

我想要的数据集如下所示(第4行数据是重要的):

Patient ID    Visit Date         Dead     Death Date    Sex    State
101           Feb/14             1          Jan/15      M      2
101           June/14            1          Jan/15      M      3 
101           December/14        1          Jan/15      M      2
101           Jan/15             1          Jan/15      M      5 
102           Jan/14             0          N/A         M      1
102           April/14           0          N/A         M      1   

您可以执行以下操作:

# Example data from the question; "N/A" is kept as a literal string because
# read.table()'s default na.strings is "NA".
df <- read.table(
  text = 'Patient_ID    Visit_Date         Dead     Death_Date    Sex    State
101           Feb/14             1          Jan/15      M      2
101           June/14            1          Jan/15      M      3 
101           December/14        1          Jan/15      M      2
102           Jan/14             0          N/A         M      1
102           April/14           0          N/A         M      1 ',
  header = TRUE,
  stringsAsFactors = FALSE
)

# Make the ID column numeric (double) so it matches the numeric ID supplied
# in the appended row.
df[["Patient_ID"]] <- as.numeric(df[["Patient_ID"]])

# Append the death record for patient 101. A list (not an atomic vector) is
# used so each value keeps its own type.
df <- rbind(
  df,
  list(101, 'Jan/15', 1, 'Jan/15', 'M', 5)
)

> df[order(df$Patient_ID),] #sort on Patient ID and the last row is inserted where it should
  Patient_ID  Visit_Date Dead Death_Date Sex State
1        101      Feb/14    1     Jan/15   M     2
2        101     June/14    1     Jan/15   M     3
3        101 December/14    1     Jan/15   M     2
6        101      Jan/15    1     Jan/15   M     5
4        102      Jan/14    0        N/A   M     1
5        102    April/14    0        N/A   M     1

因此,您真正需要用到的只有 rbind 函数,它会在 data.frame 的末尾添加一行。用法是 rbind(<your_data.frame>, <a vector with the values to add>)。在我们的例子中,<your_data.frame> 是 df,<a vector with the values to add> 是 list(101, 'Jan/15', 1, 'Jan/15', 'M', 5)。

最好使用列表(而不是原子向量)来添加行,因为这样可以保持 data.frame 各列的类型不变;使用原子向量会把所有内容强制转换为字符。

一个 data.table 解法:

# Example data from the question; "N/A" is kept as a literal string because
# read.table()'s default na.strings is "NA".
df <- read.table(header = TRUE, text = 'Patient_ID    Visit_Date         Dead     Death_Date    Sex    State
101           Feb/14             1          Jan/15      M      2
101           June/14            1          Jan/15      M      3 
101           December/14        1          Jan/15      M      2
102           Jan/14             0          N/A         M      1
102           April/14           0          N/A         M      1 ', stringsAsFactors = FALSE)

library(data.table)
DT <- as.data.table(df)

# One row per deceased patient, keeping only the patient-level columns
# (ID, death indicator, death date and sex).
dead <- unique(DT[Death_Date != "N/A", .(Patient_ID, Dead, Death_Date, Sex)])

# The death becomes the final "visit": visit date = death date, state = 5.
dead[, c("Visit_Date", "State") := list(Death_Date, 5L)]

# Recombine with the original records. Columns are matched by name because
# `dead` has a different column order than `DT`.
records <- rbind(DT, dead, use.names = TRUE)

# Sort by patient, then chronologically. "Mon/yy" has no day, so pin it to the
# first of the month and parse month + two-digit year; parsing with "%b/%d"
# would treat the year as a day of the month and put the death row first.
# Only the first three letters of the month are used so "June"/"December"
# parse with %b (month names are locale-dependent; English is assumed).
visit_month <- as.Date(
  sprintf(
    "01/%s/%s",
    substr(records$Visit_Date, 1, 3),
    sub("^.*/", "", records$Visit_Date)
  ),
  format = "%d/%b/%y"
)
records[order(Patient_ID, visit_month)]
#    Patient_ID  Visit_Date Dead Death_Date Sex State
# 1:        101      Feb/14    1     Jan/15   M     2
# 2:        101     June/14    1     Jan/15   M     3
# 3:        101 December/14    1     Jan/15   M     2
# 4:        101      Jan/15    1     Jan/15   M     5
# 5:        102      Jan/14    0        N/A   M     1
# 6:        102    April/14    0        N/A   M     1

这里有几点需要注意。首先,您应该使用 NA 而不是字符串 "N/A" 来表示缺失值。其次,您应该把这些日期转换为 Date 类型,以便使用它们(并按它们正确排序)。

# Same example data with shorter column names; "N/A" is read in as a real NA.
dat <- read.table(
  text = "ID    Visit         Dead     Death    Sex    State
101           Feb/14             1          Jan/15      M      2
101           June/14            1          Jan/15      M      3 
101           December/14        1          Jan/15      M      2
102           Jan/14             0          N/A         M      1
102           April/14           0          N/A         M      1 ",
  header = TRUE,
  na.strings = 'N/A'
)

## format dates helper
#' Convert "Month/yy" strings (e.g. "Feb/14", "December/14") to Date.
#'
#' The input carries no day, so every value is pinned to the first day of its
#' month. Only the first three letters of the month are used, so full and
#' abbreviated month names are treated alike.
#'
#' @param x Character vector or factor of "Month/yy" values; NA is allowed.
#' @return A Date vector the same length as `x`; NA where `x` is NA or does
#'   not parse.
f_dt <- function(x) {
  x <- as.character(x)
  # "\\D" strips every non-digit, leaving the two-digit year. The backslash
  # must be doubled: "\D" is not a valid escape in an R string literal.
  res <- sprintf("01/%s/%s", substr(x, 1, 3), gsub("\\D", "", x))
  # %b matches the abbreviated month name in the current locale
  # (English month names are assumed here).
  as.Date(res, "%d/%b/%y")
}

## parse both date columns into Date objects
dat$Visit <- f_dt(dat$Visit)
dat$Death <- f_dt(dat$Death)

## keep each deceased patient's last row as a template, then turn it into
## the death record: the visit date becomes the death date, the state 5
deaths <- dat[!is.na(dat$Death) & !duplicated(dat$ID, fromLast = TRUE), ]
deaths$Visit <- deaths$Death
deaths$State <- 5

## append the death records and sort by patient, then by visit date
out <- rbind(dat, deaths)
out[order(out$ID, out$Visit), ]

#     ID      Visit Dead      Death Sex State
# 1  101 2014-02-01    1 2015-01-01   M     2
# 2  101 2014-06-01    1 2015-01-01   M     3
# 3  101 2014-12-01    1 2015-01-01   M     2
# 31 101 2015-01-01    1 2015-01-01   M     5
# 4  102 2014-01-01    0       <NA>   M     1
# 5  102 2014-04-01    0       <NA>   M     1