计算不同日期之间的天数(过滤大table)

Calculation of days between different dates (filter big table)

我正在尝试从数据库中过滤一个大 table,以便它以自动方式为我工作。

目前的筛选条件是:按相同的名称/ID(name/ID)分组;按名称中的特定字符串过滤;按另一个条件过滤;并过滤掉(舍弃)没有值的条目(R 中的 NA)。

目前还缺少以下两个功能。每个 ID 都有一个特定的日期、一个 ID 字段和一个 Passage(传代)字段(P0,P1,...)。我该如何追加一个新列:先检查 ID,再检查 Passage,然后取该 ID 的 P0 日期,并输出每个条目的日期与 P0 日期之间相隔的天数?

ID1 ID2 Sample_ID   Type    Date    Passage colonies
abc-0001-P0 abc-0001    abc-0001-T  cells   3/22/16 P0  23
abc-0001-P1 abc-0001    abc-0001-T  cells   3/29/16 P1  30
abc-0002-P0 abc-0002    abc-0002-T  cells   4/03/16 P0  5
abc-0002-P1 abc-0002    abc-00021-T cells   5/05/16 P1  18
abc-0002-P2 abc-0002    abc-00021-T cells   5/19/16 P1  27

如果输出中能多出一列,其中 abc-0001 各样本的天数以 abc-0001 的 P0 日期为基准计算,abc-0002 各样本的天数以 abc-0002 各自的 P0 日期为基准计算,那就太好了。

另外,是否可以添加一个函数,绘制天数与菌落数(colonie_count)的折线图? --> 这一点不如日期差(天数)的计算重要。

这是我当前的代码:

library(readxl)
library(stringr)
library(dplyr)

# Keep only the "abc" samples that are cultured cells and have a passage
# recorded, then export the result to CSV.
# NOTE(review): assumes `a` is the table loaded from the database and that its
# columns are named as in the example tables of this post -- confirm against
# the real data.
b <- a %>%
    # Column names are case-sensitive; the example tables spell it `colonies`.
    select(ID1, ID2, Sample_ID, Type, Date, Passage, colonies) %>%
    group_by(ID2) %>%
    # Sample name must contain "abc".
    filter(str_detect(ID2, "abc")) %>%
    # "cells" is a value of the Type column; it never occurs inside ID1, so
    # testing ID1 would discard every row.
    filter(str_detect(Type, "cells")) %>%
    # Drop entries without a passage (NA in R).
    filter(!is.na(Passage))
write.csv(b, file = "test.csv")

如果你能帮助我,那就太好了。

最佳,

丹尼斯


添加更多示例是我的起点和目标:

    ID1              ID2    Sample_ID   Type    Date    Passage  colonies
abc-0001-T-P0   abc-0001    abc-0001-T  cells   3/22/16 P0       23
abc-0001-T      abc-0001    abc-0001-T  frozen  3/22/16     
abc-0001-N      abc-0001    abc-0001-N  frozen  3/22/16     
abc-0001-P1     abc-0001    abc-0001-T  cells   3/29/16 P1       30
abc-0002-T-P0   abc-0002    abc-0002-T  frozen  4/03/16     
abc-0002-T-SFT  abc-0002    abc-0002-T  frozen  4/03/16     
abc-0002-N-SFT  abc-0002    abc-0002-N  cells   4/03/16 P0        5
abc-0002-P1     abc-0002    abc-00021-T cells   5/05/16 P1       18
abc-0002-P2     abc-0002    abc-00021-T cells   5/19/16 P1       27

我想得到这样的结果:

        ID1      ID2    Sample_ID   Type    Date    Passage colonies     days
abc-0001-P0 abc-0001    abc-0001-T  cells   3/22/16 P0      23           0
abc-0001-P1 abc-0001    abc-0001-T  cells   3/29/16 P1      30           7
abc-0002-P0 abc-0002    abc-0002-T  cells   4/03/16 P0      5            0
abc-0002-P1 abc-0002    abc-00021-T cells   5/05/16 P1      18          32
abc-0002-P2 abc-0002    abc-00021-T cells   5/19/16 P1      27          46

基于这张表,我还想绘制以天数为 x 轴、菌落数为 y 轴的图,每张图的标题为对应的 Sample_ID。

如果可以的话就太好了。谢谢!!

像这样能做到吗,

# For every ID2 group, count the days elapsed since that group's P0 sample and
# rebuild ID1 as "<abc-prefix>-<Passage>". Rows without a passage are dropped.
tbl %>%
    group_by(ID2) %>%
    filter(!is.na(Passage)) %>%
    mutate(
        # Date is stored as text such as "3/22/16"; parse it before subtracting.
        date_parsed = as.Date(as.character(Date), format = "%m/%d/%y"),
        # Days since the group's P0 date (NA when the group has no P0 row).
        # The difference must be taken on dates, not on the colony counts.
        days = as.numeric(date_parsed - date_parsed[Passage == "P0"][1],
                          units = "days"),
        # The backslash has to be doubled inside an R string literal ("\\d").
        ID1 = paste0(str_extract(ID1, "^abc.\\d+"), "-", Passage)
    ) %>%
    select(-date_parsed)
#> # A tibble: 5 x 8
#> # Groups:   ID2 [2]
#>           ID1      ID2   Sample_ID   Type    Date Passage colonies  days
#>         <chr>   <fctr>      <fctr> <fctr>  <fctr>  <fctr>    <int> <dbl>
#> 1 abc-0001-P0 abc-0001  abc-0001-T  cells 3/22/16      P0       23     0
#> 2 abc-0001-P1 abc-0001  abc-0001-T  cells 3/29/16      P1       30     7
#> 3 abc-0002-P0 abc-0002  abc-0002-N  cells 4/03/16      P0        5     0
#> 4 abc-0002-P1 abc-0002 abc-00021-T  cells 5/05/16      P1       18    32
#> 5 abc-0002-P1 abc-0002 abc-00021-T  cells 5/19/16      P1       27    46

这是数据,以防其他人想试一试;

# Example data for the snippets in this post: a 9-row table with the columns
# ID1, ID2, Sample_ID, Type, Date, Passage and colonies.
# All text columns (including Date) are stored as factors; colonies is integer.
# Passage and colonies are NA on the rows that carry no passage.
tbl <- structure(list(ID1 = structure(c(4L, 3L, 1L, 2L, 8L, 9L, 5L, 
6L, 7L), .Label = c("abc-0001-N", "abc-0001-P1", "abc-0001-T", 
"abc-0001-T-P0", "abc-0002-N-SFT", "abc-0002-P1", "abc-0002-P2", 
"abc-0002-T-P0", "abc-0002-T-SFT"), class = "factor"), ID2 = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("abc-0001", "abc-0002"
), class = "factor"), Sample_ID = structure(c(2L, 2L, 1L, 2L, 
4L, 4L, 3L, 5L, 5L), .Label = c("abc-0001-N", "abc-0001-T", "abc-0002-N", 
"abc-0002-T", "abc-00021-T"), class = "factor"), Type = structure(c(1L, 
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("cells", "frozen"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 2L, 3L, 
3L, 3L, 4L, 5L), .Label = c("3/22/16", "3/29/16", "4/03/16", 
"5/05/16", "5/19/16"), class = "factor"), Passage = structure(c(1L, 
NA, NA, 2L, NA, NA, 1L, 2L, 2L), .Label = c("P0", "P1"), class = "factor"), 
    colonies = c(23L, NA, NA, 30L, NA, NA, 5L, 18L, 27L)), .Names = c("ID1", 
"ID2", "Sample_ID", "Type", "Date", "Passage", "colonies"), row.names = c(NA, 
-9L), class = c("tbl_df", "tbl", "data.frame"))

下面是实际生产数据的代码,

# Production data: for every ID2 group, count the days elapsed since that
# group's P0 sample and rebuild ID as "<abc-prefix>-<Passage>".
a %>% group_by(ID2) %>%  filter(!is.na(Passage))  %>%
         mutate(
                # Works whether the column is still "YYYY-MM-DD" text or
                # has already been converted to Date.
                created = as.Date(`Date_Sample_Created?`, format = "%Y-%m-%d"),
                # Days since the group's P0 date (NA when the group has no P0
                # row); the difference is taken on dates, not colony counts.
                days = as.numeric(created - created[Passage == "P0"][1],
                                  units = "days"),
                # The original ID column holds plain numbers ("36", "37", ...),
                # so the "abc" prefix is taken from Derived_Sample_ID instead;
                # "\\d" needs the doubled backslash inside an R string.
                # NOTE(review): confirm Derived_Sample_ID is the intended source.
                ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), '-', Passage)) %>% 
     select(-created)
#> # A tibble: 9 x 20
#> # Groups:   ID2 [2]
#>            ID Derived_Sample_ID      ID2  Sample_ID `Derived_Sample_Type?`
#>         <chr>             <chr>    <chr>      <chr>                  <chr>
#> 1 abc-0001-P0     abc-0001-T-p0 abc-0001 abc-0001-T                  cells
#> 2 abc-0002-P0     abc-0002-T-p0 abc-0002 abc-0002-T                  cells
#> 3 abc-0002-P1     abc-0002-T-p1 abc-0002 abc-0002-T                  cells
#> ...

或更好,

# Same computation without a temporary column: days elapsed since the P0
# sample of each ID2 group, plus a rebuilt "<abc-prefix>-<Passage>" ID.
a %>% group_by(ID2) %>%  filter(!is.na(Passage))  %>%
    mutate(
           # Subtract the group's P0 creation date (NA when there is no P0 row);
           # the difference is taken on dates, not on the colony counts.
           days = as.numeric(
               as.Date(`Date_Sample_Created?`, format = "%Y-%m-%d") -
                   as.Date(`Date_Sample_Created?`[Passage == "P0"][1],
                           format = "%Y-%m-%d"),
               units = "days"),
           # The original ID column holds plain numbers, so the prefix comes
           # from Derived_Sample_ID; "\\d" needs the doubled backslash.
           # NOTE(review): confirm Derived_Sample_ID is the intended source.
           ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), '-', Passage))

这段内容太长,无法作为评论发布。我所做的唯一改动是把日期列转换为 Date(日期)类型:

# Parse the "YYYY-MM-DD" creation-date column into Date values, in place.
a[["Date_Sample_Created?"]] <- as.Date(
  a[["Date_Sample_Created?"]],
  format = "%Y-%m-%d"
)

希望对您有所帮助。

# Production sample data: a 14-row, 19-column table.
# Every populated column is character, including the counts in `colonies` and
# the dates in `Date_Sample_Created?`, which are written as "YYYY-MM-DD".
# `Passage` and `colonies` repeat the values of `Passage_#` and `colonies_#`.
# The expression is not assigned to a name here; the snippets in this post
# refer to the table as `a`.
structure(list(ID = c("36", "37", "38", "45", "46", "47", "48", 
"57", "59", "121", "131", "132", "134", "206"), Derived_Sample_ID = c("abc-0001-T-p0", 
"abc-0001-T-SFT", "abc-0001-N-SFT", "abc-0002-T-p0", "abc-0002-T-SFT", 
"abc-0002-N-SFT", "abc-0002-T-p1", "abc-0002-T-p2", "abc-0001-T-CPT", 
"abc-0001-T-p1", "abc-0001-T-p2", "abc-0002-T-p3", "abc-0001-T-p3", 
"abc-0002-T-P4"), ID2 = c("abc-0001", "abc-0001", "abc-0001", 
"abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0001", 
"abc-0001", "abc-0001", "abc-0002", "abc-0001", "abc-0002"), 
    Sample_ID = c("abc-0001-T", "abc-0001-T", "abc-0001-N", "abc-0002-T", 
    "abc-0002-T", "abc-0002-N", "abc-0002-T", "abc-0002-T", "abc-0001-T", 
    "abc-0001-T", "abc-0001-T", "abc-0002-T", "abc-0001-T", "abc-0002-T"
    ), `Derived_Sample_Type?` = c("cells", "Frozen tissue", "Frozen tissue", 
    "cells", "Frozen tissue", "Frozen tissue", "cells", "cells", 
    "Frozen tissue", "cells", "cells", "cells", "cells", "cells"
    ), `DNA_concentration?` = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), `DNA_concentration?__1` = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Passage_#` = c("P0", 
    NA, NA, "P0", NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3", 
    "P4"), `Culture_Plate_Type?` = c("24-well plate", NA, NA, 
    "24-well plate", NA, NA, "24-well plate", "24-well plate", 
    NA, "24-well plate", "24-well plate", "24-well plate", "24-well plate", 
    "6-well plate"), `colonies_#` = c("23", NA, NA, "12", NA, 
    NA, "10", "8", NA, "23", "24", "14", "6", "21"), `Split_Ratio?` = c("NA", 
    NA, NA, "NA", NA, NA, "1:1", "1:1", NA, "1:1", "1:1", "1:2", 
    NA, "NA"), `DNA_concentration?__2` = c(NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA), `DNA_concentration?__3` = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Date_Sample_Created?` = c("2017-07-26", 
    "2017-07-26", "2017-07-26", "2017-07-28", "2017-07-28", "2017-07-28", 
    "2017-07-31", "2017-08-11", "2017-07-26", "2017-08-21", "2017-08-22", 
    "2017-08-22", "2017-09-01", "2017-09-06"), `Time_Sample_Created?` = c("6:30PM", 
    "5:00PM", "5:00PM", "1:00PM", "11:00AM", "11:00AM", "04:30 PM", 
    "04:00 PM", "5:00PM", "10:00AM", "05:00 PM", "05:00 PM", 
    "01:30 PM", "4:00 PM"), `SOP_used?` = c("Culture of cellss - #5", 
    "Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5", 
    "Culture of Lung Tumor cellss - #6", "Culture of Lung Tumor cellss - #6", 
    "Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5", 
    "Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5", 
    "Culture of cellss - #5", "Culture of cellss - #5"), `Any_changes_to_SOP?` = c("Yes", 
    "No", "No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes", 
    "Yes", "Yes", "Yes", "Yes"), Passage = c("P0", NA, NA, "P0", 
    NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3", "P4"), colonies = c("23", 
    NA, NA, "12", NA, NA, "10", "8", NA, "23", "24", "14", "6", 
    "21")), .Names = c("ID", "Derived_Sample_ID", "ID2", "Sample_ID", 
"Derived_Sample_Type?", "DNA_concentration?", "DNA_concentration?__1", 
"Passage_#", "Culture_Plate_Type?", "colonies_#", "Split_Ratio?", 
"DNA_concentration?__2", "DNA_concentration?__3", "Date_Sample_Created?", 
"Time_Sample_Created?", "SOP_used?", "Any_changes_to_SOP?", "Passage", 
"colonies"), row.names = c(NA, -14L), class = c("tbl_df", "tbl", 
"data.frame"))

非常感谢!!!