查找时间序列数据的指定间隔中的最小值

Question

我有一组实验室检验值，想以入院日期为基准观察其随时间的变化趋势。每个患者的该项检验记录条数和随访时间各不相同。我的目标是在入院日期（df 中的 Date_One）之后的不同时间间隔内确定该检验的最小值，即第 0-30 天、第 31-90 天、第 1-2 年、第 2-3 年、第 3-4 年等，直到他们最后一次随访，以帮助我识别超出其基线某个阈值的异常值。由于这个检验值会随着时间自然变化，我想找到这些最小值来建立新的基线。由于每个患者的随访时长不同，有些长达 20 年，我很难找到一个函数，能在不必对每个想要的间隔都用 filter 和 mutate 创建新列的情况下找出这些局部最小值。我的 dput 输出如下，如果格式不正确请告诉我！

structure(list(lab_date = structure(c(10006, 10007, 10008, 10009, 
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019, 
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225, 
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858, 
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199, 
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401, 
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663, 
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698, 
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733, 
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1, 
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1, 
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1, 
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2, 
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2, 
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
    ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

Answer 1

这样做怎么样？它让您以天为单位指定各个分段的断点（可以很容易地改成月或其他单位，但需要相应修改其他代码），然后对每个分段筛选出落在该断点范围内的行，并找出其中的最小值。如果某个分段内没有任何值，则返回 NA。这应该适用于您提供的数据；如果您想将其应用于包含多个 ID 的数据框，请告诉我，这应该只需要再加一个小循环。

# Build the sample data as a plain data frame: one row per lab draw, holding
# the draw date, the measured value, the patient ID and the Date_One reference
# date. Date columns are given as day counts carrying the "Date" class.
Data <- data.frame(
  lab_date = structure(
    c(
      10006, 10007, 10008, 10009, 10010, 10011, 10012, 10013, 10014, 10015,
      10016, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026,
      10099, 10225, 10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857,
      10858, 10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
      11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 11228,
      11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 11282, 11282,
      11309, 11310, 11338, 11339, 11372, 11373, 11401, 11499, 11536, 11564,
      11582, 11597, 11598, 11625, 11660, 11663, 11664, 11665, 11666, 11667,
      11668, 11695, 11696, 11697, 11698, 11699, 11700, 11701, 11723, 11729,
      11730, 11731, 11732, 11733, 11734, 11735, 11736, 11737, 11765, 11828
    ),
    class = "Date"
  ),
  lab_value = c(
    1.1, 1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2,
    1.2, 1.5, 1.3, 1.1, 1.1, 1.1, 1, 1, 1, 1,
    1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 1.2, 1.7,
    1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7,
    2.1, 2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8,
    1.9, 2, 1.6, 1.8, 2, 2.1, 1.9, 1.8, 1.7, 1.8,
    1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 2, 1.9,
    2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2,
    2.4, 2.2, 2.2, 2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3,
    2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 3.6, 3.3, 3.1, 3
  ),
  # Every row belongs to the same patient and shares the same Date_One
  ID = rep(182, 100),
  Date_One = structure(rep(10856, 100), class = "Date")
)

# Segment start offsets, expressed in days after the reference date
SegmentBreaks <- c(0, 30, 90, 365, 730)

#Function for finding min date

# Find, for every patient, the minimum lab value inside each time segment.
#
# Arguments:
#   Data          Data frame with columns ID, lab_date (Date) and lab_value.
#   SegmentBreaks Increasing day offsets from the patient's reference date.
#                 Segment i covers [break i, break i+1); the last segment is
#                 open-ended, so it runs from the last break onwards.
#   AnchorColumn  Optional name of a Date column in Data (e.g. "Date_One")
#                 whose first value per patient is used as the reference
#                 date. When NULL (the default) the reference date is the
#                 patient's earliest lab_date.
#
# Returns a data frame with one row per unique ID: the ID column followed by
# one MinVals column per entry of SegmentBreaks. A cell is NA when the
# patient has no (non-missing) lab value in that segment.
MinAtSegments <- function(Data, SegmentBreaks, AnchorColumn = NULL) {
  UniqueIDs <- unique(Data$ID)
  IDNumber <- length(UniqueIDs)
  SegmentNumber <- length(SegmentBreaks)

  Output <- matrix(NA_real_, nrow = IDNumber, ncol = SegmentNumber)

  # seq_len() keeps the loops empty for zero patients / zero segments
  for (j in seq_len(IDNumber)) {
    # %in% (rather than ==) so an NA ID selects its own rows instead of
    # producing all-NA rows
    DataID <- Data[Data$ID %in% UniqueIDs[j], ]

    # The reference date is determined per patient, so each patient's
    # segments are measured from their own start
    if (is.null(AnchorColumn)) {
      KnownDates <- DataID$lab_date[!is.na(DataID$lab_date)]
      if (length(KnownDates) == 0) {
        next
      }
      StartDate <- min(KnownDates)
    } else {
      StartDate <- DataID[[AnchorColumn]][1]
      if (is.na(StartDate)) {
        next
      }
    }

    LowerBounds <- StartDate + SegmentBreaks

    # Iterate over segments (columns), not over every cell of the matrix
    for (i in seq_len(SegmentNumber)) {
      InSegment <- DataID$lab_date >= LowerBounds[i]
      if (i < SegmentNumber) {
        InSegment <- InSegment & DataID$lab_date < LowerBounds[i + 1]
      }

      # which() drops rows whose date is NA; missing lab values are ignored
      LabVals <- DataID$lab_value[which(InSegment)]
      LabVals <- LabVals[!is.na(LabVals)]

      if (length(LabVals) > 0) {
        Output[j, i] <- min(LabVals)
      }
    }
  }

  data.frame(ID = UniqueIDs, MinVals = Output)
}

# Apply the per-segment minimum search to the sample data
MinAtSegments(Data = Data, SegmentBreaks = SegmentBreaks)

Answer 2

这是 tidyverse 的一个可能选项（但我不确定您希望输出的格式是什么）：

library(tidyverse)

# Minimum lab value per patient within each interval after admission.
# Intervals (in 365-day years since Date_One): 0-30 days, 31-90 days,
# 91 days-1 year, then yearly up to 20 years; later labs fall in an NA group.
# For the sample patient this gives minima 1.2, 1.6, 1.6 and 1.8 for the
# 0-30 day, 91 day-1 year, 1-2 year and 2-3 year intervals (no labs fall in
# 31-90 days); the output printed below in this post predates this correction.
df %>% 
  group_by(ID, Date_One) %>% 
  # lab_date minus Date_One: positive for labs drawn after admission
  mutate(years = as.numeric(difftime(lab_date, Date_One, units = "days")) / 365) %>% 
  # Keep only labs drawn on or after the admission date
  filter(years >= 0) %>% 
  group_by(gr = cut(years, breaks = c(-Inf, 30 / 365, 90 / 365, seq(1, 20, by = 1))), ID) %>% 
  # .groups = "drop" returns an ungrouped result
  summarise(lab_value = min(lab_value), .groups = "drop")

输出

  gr               ID lab_value
  <fct>         <dbl>     <dbl>
1 (-Inf,0.0822]   182       1.2
2 (0.164,1]       182       1.2
3 (1,2]           182       1.2
4 (2,3]           182       1

查找时间序列数据的指定间隔中的最小值

Finding minimums in specified intervals for time series data

r

time-series

outliers

lubridate