查找时间序列数据的指定间隔中的最小值
Finding minimums in specified intervals for time series data
我有一组实验室检验值,想根据入院日期观察其随时间的变化趋势。每位患者的检验/随访(follow-up)时间条目数量各不相同。我的目标是在入院日期(df 中的 Date_One)之后的不同时间区间内确定该检验的最小值,即第 0-30 天、第 31-90 天、1-2 年、2-3 年、3-4 年等,直到最后一次随访,以帮助我识别超出其基线某个阈值的异常值。由于这个检验值会随时间自然变化,我想找到这些最小值来建立新的基线。由于每位患者的随访时长不同,有些长达 20 年,我很难找到一个函数来求这些局部最小值,而不必用 filter 和 mutate 为每个想要的区间分别创建一个新列。我的 dput 输出如下,如果格式不正确请告诉我!
structure(list(lab_date = structure(c(10006, 10007, 10008, 10009,
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019,
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225,
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858,
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226,
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281,
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401,
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663,
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698,
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733,
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1,
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1,
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2,
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1,
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2,
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1,
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2,
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4,
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
这样做怎么样?它让您以天为单位指定各个分段的断点(可以轻松改成月或其他单位,但需要相应修改其余代码),然后对每个分段筛选出日期落在对应断点范围内的行,并求其中的最小值。如果某个分段内没有任何值,则返回 NA。
这应该适用于您提供的数据,如果您想将其应用于具有多个 ID 的数据框,请告诉我,这应该只是一个额外的小循环。
# Example data as a plain data.frame: 100 lab draws for patient 182.
# Dates are written as day counts since 1970-01-01; Date_One is the same
# day (10856) on every row.
Data <- data.frame(
  lab_date = as.Date(
    c(
      10006, 10007, 10008, 10009, 10010, 10011, 10012, 10013, 10014, 10015,
      10016, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026,
      10099, 10225, 10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857,
      10858, 10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
      11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 11228,
      11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 11282, 11282,
      11309, 11310, 11338, 11339, 11372, 11373, 11401, 11499, 11536, 11564,
      11582, 11597, 11598, 11625, 11660, 11663, 11664, 11665, 11666, 11667,
      11668, 11695, 11696, 11697, 11698, 11699, 11700, 11701, 11723, 11729,
      11730, 11731, 11732, 11733, 11734, 11735, 11736, 11737, 11765, 11828
    ),
    origin = "1970-01-01"
  ),
  lab_value = c(
    1.1, 1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2,
    1.2, 1.5, 1.3, 1.1, 1.1, 1.1, 1, 1, 1, 1,
    1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 1.2, 1.7,
    1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7,
    2.1, 2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8,
    1.9, 2, 1.6, 1.8, 2, 2.1, 1.9, 1.8, 1.7, 1.8,
    1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 2, 1.9,
    2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2,
    2.4, 2.2, 2.2, 2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3,
    2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 3.6, 3.3, 3.1, 3
  ),
  ID = rep(182, 100),
  Date_One = as.Date(rep(10856, 100), origin = "1970-01-01")
)
# Segment break points, in days
SegmentBreaks <- c(0, 30, 90, 365, 730)
#Function for finding the minimum lab value within each segment
#' Minimum lab value per patient within consecutive time segments
#'
#' For every distinct `ID`, the segment boundaries are that patient's earliest
#' `lab_date` plus each offset in `SegmentBreaks`; the last segment is closed
#' by today's date (`Sys.Date()`). Segment `i` covers
#' `[break[i], break[i + 1])`.
#'
#' @param Data data.frame with columns `ID`, `lab_date` (Date) and `lab_value`.
#' @param SegmentBreaks numeric vector of day offsets marking segment starts.
#' @return data.frame with one row per ID: column `ID` followed by one
#'   `MinVals` column per segment (named `MinVals.1`, `MinVals.2`, ... when
#'   there are several). A segment with no observations is `NA`.
MinAtSegments <- function(Data, SegmentBreaks) {
  UniqueIDs <- unique(Data$ID)
  n_ids <- length(UniqueIDs)
  n_segments <- length(SegmentBreaks)
  Output <- matrix(NA, nrow = n_ids, ncol = n_segments)

  # seq_len() keeps both loops empty for zero IDs / zero segments.
  for (j in seq_len(n_ids)) {
    # Work on this patient's rows only, so other patients cannot leak in.
    DataID <- Data[which(Data$ID == UniqueIDs[j]), , drop = FALSE]

    # Baseline is the patient's first lab date.
    # NOTE(review): if segments should start at the admission date instead,
    # use DataID$Date_One[1] here -- confirm the intended baseline.
    DateBreaks <- c(min(DataID$lab_date) + SegmentBreaks, Sys.Date())

    # Iterate over segments (matrix columns), not over every matrix cell.
    for (i in seq_len(n_segments)) {
      in_segment <- DataID$lab_date >= DateBreaks[i] &
        DataID$lab_date < DateBreaks[i + 1]
      LabVals <- DataID$lab_value[in_segment]
      if (length(LabVals) > 0) {
        Output[j, i] <- min(LabVals)
      }
    }
  }

  data.frame("ID" = UniqueIDs, "MinVals" = Output)
}
#Run the function on the example data; returns one row per ID with the minimum lab value of each segment
MinAtSegments(Data,SegmentBreaks)
这是使用 tidyverse 的一种可行方案(但我不确定您希望输出采用什么格式):
library(tidyverse)
# Minimum lab value per patient within each interval after admission.
# `years` is the time from admission (Date_One) to the lab draw, so labs taken
# after admission are positive and pre-admission labs are filtered out.
# Intervals: 0-30 days, 31-90 days, then yearly bins up to 20 years.
# NOTE: output posted alongside the original snippet was produced with the
# sign inverted (pre-admission labs) and a 60-day break, so it will not match.
df %>%
  group_by(ID, Date_One) %>%
  mutate(years = as.numeric(difftime(lab_date, Date_One, units = "days") / 365)) %>%
  filter(years >= 0) %>%
  group_by(
    gr = cut(years, breaks = c(-Inf, 30 / 365, 90 / 365, seq(1, 20, by = 1))),
    ID
  ) %>%
  # Drop grouping so the result is not left grouped by `gr`.
  summarise(lab_value = min(lab_value), .groups = "drop")
输出
gr ID lab_value
<fct> <dbl> <dbl>
1 (-Inf,0.0822] 182 1.2
2 (0.164,1] 182 1.2
3 (1,2] 182 1.2
4 (2,3] 182 1
我有一组实验室检验值,想根据入院日期观察其随时间的变化趋势。每位患者的检验/随访(follow-up)时间条目数量各不相同。我的目标是在入院日期(df 中的 Date_One)之后的不同时间区间内确定该检验的最小值,即第 0-30 天、第 31-90 天、1-2 年、2-3 年、3-4 年等,直到最后一次随访,以帮助我识别超出其基线某个阈值的异常值。由于这个检验值会随时间自然变化,我想找到这些最小值来建立新的基线。由于每位患者的随访时长不同,有些长达 20 年,我很难找到一个函数来求这些局部最小值,而不必用 filter 和 mutate 为每个想要的区间分别创建一个新列。我的 dput 输出如下,如果格式不正确请告诉我!
structure(list(lab_date = structure(c(10006, 10007, 10008, 10009,
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019,
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225,
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858,
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226,
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281,
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401,
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663,
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698,
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733,
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1,
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1,
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2,
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1,
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2,
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1,
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2,
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4,
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
这样做怎么样?它让您以天为单位指定各个分段的断点(可以轻松改成月或其他单位,但需要相应修改其余代码),然后对每个分段筛选出日期落在对应断点范围内的行,并求其中的最小值。如果某个分段内没有任何值,则返回 NA。 这应该适用于您提供的数据,如果您想将其应用于具有多个 ID 的数据框,请告诉我,这应该只是一个额外的小循环。
# Example data as a plain data.frame: 100 lab draws for patient 182.
# Dates are written as day counts since 1970-01-01; Date_One is the same
# day (10856) on every row.
Data <- data.frame(
  lab_date = as.Date(
    c(
      10006, 10007, 10008, 10009, 10010, 10011, 10012, 10013, 10014, 10015,
      10016, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026,
      10099, 10225, 10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857,
      10858, 10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
      11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 11228,
      11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 11282, 11282,
      11309, 11310, 11338, 11339, 11372, 11373, 11401, 11499, 11536, 11564,
      11582, 11597, 11598, 11625, 11660, 11663, 11664, 11665, 11666, 11667,
      11668, 11695, 11696, 11697, 11698, 11699, 11700, 11701, 11723, 11729,
      11730, 11731, 11732, 11733, 11734, 11735, 11736, 11737, 11765, 11828
    ),
    origin = "1970-01-01"
  ),
  lab_value = c(
    1.1, 1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2,
    1.2, 1.5, 1.3, 1.1, 1.1, 1.1, 1, 1, 1, 1,
    1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 1.2, 1.7,
    1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7,
    2.1, 2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8,
    1.9, 2, 1.6, 1.8, 2, 2.1, 1.9, 1.8, 1.7, 1.8,
    1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 2, 1.9,
    2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2,
    2.4, 2.2, 2.2, 2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3,
    2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 3.6, 3.3, 3.1, 3
  ),
  ID = rep(182, 100),
  Date_One = as.Date(rep(10856, 100), origin = "1970-01-01")
)
# Segment break points, in days
SegmentBreaks <- c(0, 30, 90, 365, 730)
#Function for finding the minimum lab value within each segment
#' Minimum lab value per patient within consecutive time segments
#'
#' For every distinct `ID`, the segment boundaries are that patient's earliest
#' `lab_date` plus each offset in `SegmentBreaks`; the last segment is closed
#' by today's date (`Sys.Date()`). Segment `i` covers
#' `[break[i], break[i + 1])`.
#'
#' @param Data data.frame with columns `ID`, `lab_date` (Date) and `lab_value`.
#' @param SegmentBreaks numeric vector of day offsets marking segment starts.
#' @return data.frame with one row per ID: column `ID` followed by one
#'   `MinVals` column per segment (named `MinVals.1`, `MinVals.2`, ... when
#'   there are several). A segment with no observations is `NA`.
MinAtSegments <- function(Data, SegmentBreaks) {
  UniqueIDs <- unique(Data$ID)
  n_ids <- length(UniqueIDs)
  n_segments <- length(SegmentBreaks)
  Output <- matrix(NA, nrow = n_ids, ncol = n_segments)

  # seq_len() keeps both loops empty for zero IDs / zero segments.
  for (j in seq_len(n_ids)) {
    # Work on this patient's rows only, so other patients cannot leak in.
    DataID <- Data[which(Data$ID == UniqueIDs[j]), , drop = FALSE]

    # Baseline is the patient's first lab date.
    # NOTE(review): if segments should start at the admission date instead,
    # use DataID$Date_One[1] here -- confirm the intended baseline.
    DateBreaks <- c(min(DataID$lab_date) + SegmentBreaks, Sys.Date())

    # Iterate over segments (matrix columns), not over every matrix cell.
    for (i in seq_len(n_segments)) {
      in_segment <- DataID$lab_date >= DateBreaks[i] &
        DataID$lab_date < DateBreaks[i + 1]
      LabVals <- DataID$lab_value[in_segment]
      if (length(LabVals) > 0) {
        Output[j, i] <- min(LabVals)
      }
    }
  }

  data.frame("ID" = UniqueIDs, "MinVals" = Output)
}
#Run the function on the example data; returns one row per ID with the minimum lab value of each segment
MinAtSegments(Data,SegmentBreaks)
这是使用 tidyverse 的一种可行方案(但我不确定您希望输出采用什么格式):
library(tidyverse)
# Minimum lab value per patient within each interval after admission.
# `years` is the time from admission (Date_One) to the lab draw, so labs taken
# after admission are positive and pre-admission labs are filtered out.
# Intervals: 0-30 days, 31-90 days, then yearly bins up to 20 years.
# NOTE: output posted alongside the original snippet was produced with the
# sign inverted (pre-admission labs) and a 60-day break, so it will not match.
df %>%
  group_by(ID, Date_One) %>%
  mutate(years = as.numeric(difftime(lab_date, Date_One, units = "days") / 365)) %>%
  filter(years >= 0) %>%
  group_by(
    gr = cut(years, breaks = c(-Inf, 30 / 365, 90 / 365, seq(1, 20, by = 1))),
    ID
  ) %>%
  # Drop grouping so the result is not left grouped by `gr`.
  summarise(lab_value = min(lab_value), .groups = "drop")
输出
gr ID lab_value
<fct> <dbl> <dbl>
1 (-Inf,0.0822] 182 1.2
2 (0.164,1] 182 1.2
3 (1,2] 182 1.2
4 (2,3] 182 1