计算 R 中每个 ID 每年的中位数并绘制结果

Question

数据集：

# Example data as dput() output: a data.frame with 32 rows and four columns:
#   ID              numeric identifier (five distinct values)
#   date_of_bloods  POSIXct date-time, time zone UTC
#   result          numeric measurement
#   `mutation type` numeric code (1, 2 or 3)
# The expression is not assigned to a name here; assign it (e.g. to `filtered`)
# before running the code that follows.
structure(list(ID = c(1234, 1234, 1234, 1234, 1234, 1234, 1234, 
1234, 8769, 8769, 8769, 8769, 8769, 7457, 7457, 7457, 7457, 7457, 
7457, 55667, 55667, 55667, 55667, 55667, 55667, 55667, 3789, 
3789, 3789, 3789, 3789, 3789), date_of_bloods = structure(c(978307200, 
981072000, 1173052800, 1175731200, 1367798400, 1465171200, 1467936000, 
1659916800, 1072915200, 1075680000, 1173052800, 1175731200, 1367798400, 
978307200, 981072000, 1173052800, 1175731200, 1367798400, 1465171200, 
978307200, 981072000, 1173052800, 1270425600, 1273104000, 1465171200, 
1467936000, 1270425600, 1367798400, 1465171200, 1465257600, 1465344000, 
1465430400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    result = c(90, 80, 60, 40, 25, 22, 22, 21, 70, 65, 43, 23, 
    22, 90, 90, 88, 86, 76, 74, 58, 46, 35, 34, 33, 30, 24, 76, 
    67, 56, 34, 33, 23), `mutation type` = c(1, 1, 1, 1, 1, 1, 
    1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 
    3, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -32L), class = "data.frame")

我想计算每个 ID 每年的 result 中位数，并把年份统一表示为 0、1、2、3 等（而不是实际年份），以确保各队列之间的一致性；然后把这些折线绘制出来，并标明各自所属的突变类别。

我完成了：

# Reduce each sampling date to its calendar year. The year is stored as a
# number (not text) so it can serve as the predictor in lm() further down.
filtered$date_of_bloods <- as.integer(
  format(filtered$date_of_bloods, format = "%Y")
)

# Split into one data frame per ID.
a <- split(filtered, filtered$ID)

# Median result per year within a single ID's data frame.
# The measurement column in the data is called `result`.
medianfunc <- function(y) {
  aggregate(result ~ date_of_bloods, data = y, FUN = median)
}
# lapply() keeps one data frame per ID; sapply() would try to simplify the
# data frames into a matrix of list cells.
medians <- lapply(a, medianfunc)

# The per-ID list is already the shape needed for model fitting, so `g` is
# kept only as an alias under its original name.
g <- medians

# Fit one linear model per ID and return its intercept and slope. The yearly
# median is the response and the year is the predictor, so the slope is the
# change in median result per calendar year.
coefLM <- function(x) {
  coef(lm(result ~ date_of_bloods, data = x))
}
# Rows: (Intercept) and slope; columns: one per ID.
coefs <- vapply(g, coefLM, numeric(2))

实际年份并不重要；为了统一起见，我希望每个 ID 的年份都依次编号为 0、1、2、3、4 等，但我不知道该怎么做。之后我想把这些数据（每个 ID 每年的血液检测结果中位数）绘制出来，并以某种方式标明它们各自属于哪个突变类别。

我希望这不是一个太宽泛的问题。

非常感谢

Answer 1

你的变量命名不太清楚。假设你提供的数据名为 df，那么可以这样做：

# Derive the calendar year (as a four-digit string) from the sampling date.
df[["year"]] <- format(df[["date_of_bloods"]], format = "%Y")
# Median of `result` for every year / ID combination present in the data.
aggregate(result ~ year + ID, data = df, FUN = median)

   year    ID result
1  2001  1234   85.0
2  2007  1234   50.0
3  2013  1234   25.0
4  2016  1234   22.0
5  2022  1234   21.0
6  2010  3789   76.0
7  2013  3789   67.0
8  2016  3789   33.5
9  2001  7457   90.0
10 2007  7457   87.0
11 2013  7457   76.0
12 2016  7457   74.0
13 2004  8769   67.5
14 2007  8769   33.0
15 2013  8769   22.0
16 2001 55667   52.0
17 2007 55667   35.0
18 2010 55667   33.5
19 2016 55667   27.0

Answer 2

您可以试试这个（filtered 是您包含的 dput()）。希望对您有所帮助：

library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)
#Data
# One row per ID / calendar year / mutation type, holding the median result.
#Variables
# After summarising, drop the remaining grouping and turn the three
# identifying columns into factors so ggplot treats them as discrete.
df1 <- filtered %>%
  mutate(year = year(date_of_bloods)) %>%
  group_by(ID, year, `mutation type`) %>%
  summarise(med = median(result)) %>%
  ungroup() %>%
  mutate(
    ID = as.factor(ID),
    year = as.factor(year),
    `mutation type` = as.factor(`mutation type`)
  )
#Plot
# One line per year across the IDs, coloured by year.
ggplot(
  df1,
  aes(x = ID, y = med, fill = `mutation type`, color = year, group = year)
) +
  geom_line()

对于模型：

#Models
# Fit one linear model per ID: yearly median result against calendar year.
# `year` was converted to a factor for plotting, so turn it back into a number
# first; with a factor, lm() would estimate one dummy coefficient per year
# level instead of a single slope.
fits <- df1 %>%
  mutate(year = as.numeric(as.character(year))) %>%
  group_by(ID) %>%
  do(fitmodel = lm(med ~ year, data = .))
#Coefs
# Tidy every fitted model and stack the coefficient tables, labelling the rows
# with the ID they belong to. This avoids tidy(fits, fitmodel), which relied on
# broom's tidiers for rowwise data frames (no longer provided by recent broom
# releases).
dfCoef <- bind_rows(
  lapply(setNames(fits$fitmodel, as.character(fits$ID)), tidy),
  .id = "ID"
)


# A tibble: 10 x 6
# Groups:   ID [5]
      ID term        estimate std.error statistic p.value
   <dbl> <chr>          <dbl>     <dbl>     <dbl>   <dbl>
 1  1234 (Intercept)  6329.    1546.         4.09  0.0264
 2  1234 year           -3.13     0.769     -4.07  0.0268
 3  3789 (Intercept) 14318.    4746.         3.02  0.204 
 4  3789 year           -7.08     2.36      -3.00  0.205 
 5  7457 (Intercept)  2409.     403.         5.98  0.0269
 6  7457 year           -1.16     0.201     -5.78  0.0287
 7  8769 (Intercept)  9268.    4803.         1.93  0.304 
 8  8769 year           -4.60     2.39      -1.92  0.306 
 9 55667 (Intercept)  3294.     759.         4.34  0.0492
10 55667 year           -1.62     0.378     -4.29  0.0503

所需图形的代码：

#Plot 2
#Data modifications
# Number each ID's sampled years 0, 1, 2, ... in chronological order, so every
# ID starts at 0 regardless of its actual calendar years. Using the factor's
# level codes directly would give one global index shared by all IDs instead.
df2 <- df1 %>%
  group_by(ID) %>%
  mutate(year2 = dense_rank(as.numeric(as.character(year))) - 1) %>%
  ungroup()
# Make the index discrete, with levels in increasing order for the x axis.
df2 <- df2 %>%
  mutate(year2 = factor(year2, levels = sort(unique(year2))))
#Plot 2
# One line per ID over its year index, one panel per mutation type.
ggplot(df2, aes(x = year2, y = med, color = ID, group = ID)) +
  facet_wrap(. ~ `mutation type`) +
  geom_line()

计算 R 中每个 ID 每年的中位数并绘制结果

calculating medians per year per ID in R and plotting the outcome

r

dataframe

lm