计算不同日期之间的天数(过滤大table)
Calculation of days between different dates (filter big table)
我正在尝试从数据库中过滤一个大 table,以便它以自动方式为我工作。
目前的标准是:
按相同分组 name/ID
过滤名称中的特定语法
过滤另一个标准
以及过滤掉(filter/dismiss)没有值的条目(R 中的 NA)
唯一缺少的两个选项如下
每个 ID 都有一个特定的日期,一个 ID 字段和一个 Passage 字段(P0,P1,...)
我如何附加一个新列:先按 ID 分组,再查看 Passage,取该 ID 的 P0 日期,并输出每个条目的日期与 P0 日期之间相隔的天数?
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
如果我将 abc-0001 样本的另一列作为输出,以及为 abc-0002 样本计算的引用相应 P0 日期的日期,那就太好了。
以及是否可以添加一个函数来创建天数和 colonie_count 的线图。 -->这个不如具体日期的计算重要
这是我当前的代码:
library(readxl)
library(stringr)
library(dplyr)

# Keep the columns of interest and narrow table `a` down to the rows that
# belong to the "abc" samples, match the second criterion and have a passage.
b <- a %>%
  # Column names are case-sensitive; the table header spells it `colonies`.
  select(ID1, ID2, Sample_ID, Type, Date, Passage, colonies) %>%
  group_by(ID2) %>%
  filter(str_detect(ID2, "abc")) %>%
  # NOTE(review): in the sample table "cells" appears in `Type`, not in `ID1`;
  # confirm that `ID1` is really the column to match here.
  filter(str_detect(ID1, "cells")) %>%
  # Drop entries without a passage (NA in R).
  filter(!is.na(Passage))

write.csv(b, file = "test.csv")
如果你能帮助我,那就太好了。
祝好,
丹尼斯
添加更多示例是我的起点和目标:
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-T-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-T abc-0001 abc-0001-T frozen 3/22/16
abc-0001-N abc-0001 abc-0001-N frozen 3/22/16
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-T-P0 abc-0002 abc-0002-T frozen 4/03/16
abc-0002-T-SFT abc-0002 abc-0002-T frozen 4/03/16
abc-0002-N-SFT abc-0002 abc-0002-N cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
我想得到这个:
ID1 ID2 Sample_ID Type Date Passage colonies days
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23 0
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30 7
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5 0
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18 32
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27 46
我还想根据这个 table 绘图:天数作为 x 轴,菌落数作为 y 轴,每幅图(plot)的名称为 Sample_ID。
如果可以的话就太好了。谢谢!!
像这样能做到吗,
# For every ID2 group, keep the rows that have a passage and add `days`:
# the number of days between each row's Date and the Date of the group's
# P0 row. ID1 is rebuilt as "<abc prefix>-<Passage>".
tbl %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    # Date is stored as a factor of "m/d/yy" strings, so parse it first.
    # The reference is the P0 row of the group, not simply the first row.
    days = as.integer(difftime(
      as.Date(as.character(Date), format = "%m/%d/%y"),
      as.Date(as.character(Date[Passage == "P0"][1]), format = "%m/%d/%y"),
      units = "days"
    )),
    # The backslash must be doubled inside an R string literal.
    ID1 = paste0(str_extract(ID1, "^abc.\\d+"), "-", Passage)
  )
#> # A tibble: 5 x 8
#> # Groups: ID2 [2]
#> ID1 ID2 Sample_ID Type Date Passage colonies days
#> <chr> <fctr> <fctr> <fctr> <fctr> <fctr> <int> <int>
#> 1 abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23 0
#> 2 abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30 7
#> 3 abc-0002-P0 abc-0002 abc-0002-N cells 4/03/16 P0 5 0
#> 4 abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18 32
#> 5 abc-0002-P1 abc-0002 abc-00021-T cells 5/19/16 P1 27 46
这是数据,以防其他人想试一试;
# Example data as dput() output: a 9-row tibble with factor columns ID1, ID2,
# Sample_ID, Type, Date and Passage plus an integer `colonies` column.
# Passage and colonies are NA on the four rows that have no passage.
tbl <- structure(list(ID1 = structure(c(4L, 3L, 1L, 2L, 8L, 9L, 5L,
6L, 7L), .Label = c("abc-0001-N", "abc-0001-P1", "abc-0001-T",
"abc-0001-T-P0", "abc-0002-N-SFT", "abc-0002-P1", "abc-0002-P2",
"abc-0002-T-P0", "abc-0002-T-SFT"), class = "factor"), ID2 = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("abc-0001", "abc-0002"
), class = "factor"), Sample_ID = structure(c(2L, 2L, 1L, 2L,
4L, 4L, 3L, 5L, 5L), .Label = c("abc-0001-N", "abc-0001-T", "abc-0002-N",
"abc-0002-T", "abc-00021-T"), class = "factor"), Type = structure(c(1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("cells", "frozen"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 2L, 3L,
3L, 3L, 4L, 5L), .Label = c("3/22/16", "3/29/16", "4/03/16",
"5/05/16", "5/19/16"), class = "factor"), Passage = structure(c(1L,
NA, NA, 2L, NA, NA, 1L, 2L, 2L), .Label = c("P0", "P1"), class = "factor"),
colonies = c(23L, NA, NA, 30L, NA, NA, 5L, 18L, 27L)), .Names = c("ID1",
"ID2", "Sample_ID", "Type", "Date", "Passage", "colonies"), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
下面是实际生产数据的代码,
# For every ID2 group, keep the rows that have a passage and add `days`:
# the number of days between `Date_Sample_Created?` of each row and of the
# group's P0 row. ID is rebuilt as "<abc prefix>-<Passage>".
a %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    # Temporary Date column; as.Date() accepts both the "YYYY-MM-DD" strings
    # and a column that was already converted to Date.
    created = as.Date(`Date_Sample_Created?`),
    days = as.integer(difftime(
      created, created[Passage == "P0"][1],
      units = "days"
    )),
    # The "abc-NNNN" prefix lives in Derived_Sample_ID; the ID column holds
    # plain numbers, so extracting from it only yields NA.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  ) %>%
  select(-created)
#> # A tibble: 9 x 20
#> # Groups: ID2 [2]
#> ID Derived_Sample_ID ID2 Sample_ID `Derived_Sample_Type?`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 abc-0001-P0 abc-0001-T-p0 abc-0001 abc-0001-T cells
#> 2 abc-0002-P0 abc-0002-T-p0 abc-0002 abc-0002-T cells
#> 3 abc-0002-P1 abc-0002-T-p1 abc-0002 abc-0002-T cells
#> ...
或更好,
# Per ID2 group: days elapsed since the group's P0 sample was created,
# computed inline from `Date_Sample_Created?` without a helper column.
a %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    days = as.integer(difftime(
      as.Date(`Date_Sample_Created?`),
      as.Date(`Date_Sample_Created?`[Passage == "P0"][1]),
      units = "days"
    )),
    # Take the "abc-NNNN" prefix from Derived_Sample_ID (ID is numeric text);
    # the backslash must be doubled inside an R string literal.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  )
不能post它作为评论,因为它太长了
我所做的唯一改变是将日期列转换为 Date 类型:
# Parse the "YYYY-MM-DD" strings of the creation-date column into Date values.
created_col <- "Date_Sample_Created?"
a[[created_col]] <- as.Date(a[[created_col]], format = "%Y-%m-%d")
希望对您有所帮助。
# dput() output of a 14-row tibble (presumably the production table `a` used
# in the snippets; the value is not assigned to a name here).
# All columns are character vectors except the four all-NA DNA_concentration
# columns. `Passage` and `colonies` repeat the values of `Passage_#` and
# `colonies_#`. `Split_Ratio?` mixes the string "NA" with real NA values.
structure(list(ID = c("36", "37", "38", "45", "46", "47", "48",
"57", "59", "121", "131", "132", "134", "206"), Derived_Sample_ID = c("abc-0001-T-p0",
"abc-0001-T-SFT", "abc-0001-N-SFT", "abc-0002-T-p0", "abc-0002-T-SFT",
"abc-0002-N-SFT", "abc-0002-T-p1", "abc-0002-T-p2", "abc-0001-T-CPT",
"abc-0001-T-p1", "abc-0001-T-p2", "abc-0002-T-p3", "abc-0001-T-p3",
"abc-0002-T-P4"), ID2 = c("abc-0001", "abc-0001", "abc-0001",
"abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0001",
"abc-0001", "abc-0001", "abc-0002", "abc-0001", "abc-0002"),
Sample_ID = c("abc-0001-T", "abc-0001-T", "abc-0001-N", "abc-0002-T",
"abc-0002-T", "abc-0002-N", "abc-0002-T", "abc-0002-T", "abc-0001-T",
"abc-0001-T", "abc-0001-T", "abc-0002-T", "abc-0001-T", "abc-0002-T"
), `Derived_Sample_Type?` = c("cells", "Frozen tissue", "Frozen tissue",
"cells", "Frozen tissue", "Frozen tissue", "cells", "cells",
"Frozen tissue", "cells", "cells", "cells", "cells", "cells"
), `DNA_concentration?` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), `DNA_concentration?__1` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Passage_#` = c("P0",
NA, NA, "P0", NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3",
"P4"), `Culture_Plate_Type?` = c("24-well plate", NA, NA,
"24-well plate", NA, NA, "24-well plate", "24-well plate",
NA, "24-well plate", "24-well plate", "24-well plate", "24-well plate",
"6-well plate"), `colonies_#` = c("23", NA, NA, "12", NA,
NA, "10", "8", NA, "23", "24", "14", "6", "21"), `Split_Ratio?` = c("NA",
NA, NA, "NA", NA, NA, "1:1", "1:1", NA, "1:1", "1:1", "1:2",
NA, "NA"), `DNA_concentration?__2` = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), `DNA_concentration?__3` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Date_Sample_Created?` = c("2017-07-26",
"2017-07-26", "2017-07-26", "2017-07-28", "2017-07-28", "2017-07-28",
"2017-07-31", "2017-08-11", "2017-07-26", "2017-08-21", "2017-08-22",
"2017-08-22", "2017-09-01", "2017-09-06"), `Time_Sample_Created?` = c("6:30PM",
"5:00PM", "5:00PM", "1:00PM", "11:00AM", "11:00AM", "04:30 PM",
"04:00 PM", "5:00PM", "10:00AM", "05:00 PM", "05:00 PM",
"01:30 PM", "4:00 PM"), `SOP_used?` = c("Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of Lung Tumor cellss - #6", "Culture of Lung Tumor cellss - #6",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5"), `Any_changes_to_SOP?` = c("Yes",
"No", "No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes",
"Yes", "Yes", "Yes", "Yes"), Passage = c("P0", NA, NA, "P0",
NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3", "P4"), colonies = c("23",
NA, NA, "12", NA, NA, "10", "8", NA, "23", "24", "14", "6",
"21")), .Names = c("ID", "Derived_Sample_ID", "ID2", "Sample_ID",
"Derived_Sample_Type?", "DNA_concentration?", "DNA_concentration?__1",
"Passage_#", "Culture_Plate_Type?", "colonies_#", "Split_Ratio?",
"DNA_concentration?__2", "DNA_concentration?__3", "Date_Sample_Created?",
"Time_Sample_Created?", "SOP_used?", "Any_changes_to_SOP?", "Passage",
"colonies"), row.names = c(NA, -14L), class = c("tbl_df", "tbl",
"data.frame"))
非常感谢!!!
我正在尝试从数据库中过滤一个大 table,以便它以自动方式为我工作。
目前的标准是:按相同的 name/ID 分组;过滤名称中的特定语法;过滤另一个标准;以及过滤掉(filter/dismiss)没有值的条目(R 中的 NA)。
唯一缺少的两个选项如下:每个 ID 都有一个特定的日期、一个 ID 字段和一个 Passage 字段(P0,P1,...)。我如何附加一个新列:先按 ID 分组,再查看 Passage,取该 ID 的 P0 日期,并输出每个条目的日期与 P0 日期之间相隔的天数?
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
如果我将 abc-0001 样本的另一列作为输出,以及为 abc-0002 样本计算的引用相应 P0 日期的日期,那就太好了。
以及是否可以添加一个函数来创建天数和 colonie_count 的线图。 -->这个不如具体日期的计算重要
这是我当前的代码:
library(readxl)
library(stringr)
library(dplyr)

# Keep the columns of interest and narrow table `a` down to the rows that
# belong to the "abc" samples, match the second criterion and have a passage.
b <- a %>%
  # Column names are case-sensitive; the table header spells it `colonies`.
  select(ID1, ID2, Sample_ID, Type, Date, Passage, colonies) %>%
  group_by(ID2) %>%
  filter(str_detect(ID2, "abc")) %>%
  # NOTE(review): in the sample table "cells" appears in `Type`, not in `ID1`;
  # confirm that `ID1` is really the column to match here.
  filter(str_detect(ID1, "cells")) %>%
  # Drop entries without a passage (NA in R).
  filter(!is.na(Passage))

write.csv(b, file = "test.csv")
如果你能帮助我,那就太好了。
祝好,
丹尼斯
添加更多示例是我的起点和目标:
ID1 ID2 Sample_ID Type Date Passage colonies
abc-0001-T-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23
abc-0001-T abc-0001 abc-0001-T frozen 3/22/16
abc-0001-N abc-0001 abc-0001-N frozen 3/22/16
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30
abc-0002-T-P0 abc-0002 abc-0002-T frozen 4/03/16
abc-0002-T-SFT abc-0002 abc-0002-T frozen 4/03/16
abc-0002-N-SFT abc-0002 abc-0002-N cells 4/03/16 P0 5
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27
我想得到这个:
ID1 ID2 Sample_ID Type Date Passage colonies days
abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23 0
abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30 7
abc-0002-P0 abc-0002 abc-0002-T cells 4/03/16 P0 5 0
abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18 32
abc-0002-P2 abc-0002 abc-00021-T cells 5/19/16 P1 27 46
我还想根据这个 table 绘图:天数作为 x 轴,菌落数作为 y 轴,每幅图(plot)的名称为 Sample_ID。
如果可以的话就太好了。谢谢!!
像这样能做到吗,
# For every ID2 group, keep the rows that have a passage and add `days`:
# the number of days between each row's Date and the Date of the group's
# P0 row. ID1 is rebuilt as "<abc prefix>-<Passage>".
tbl %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    # Date is stored as a factor of "m/d/yy" strings, so parse it first.
    # The reference is the P0 row of the group, not simply the first row.
    days = as.integer(difftime(
      as.Date(as.character(Date), format = "%m/%d/%y"),
      as.Date(as.character(Date[Passage == "P0"][1]), format = "%m/%d/%y"),
      units = "days"
    )),
    # The backslash must be doubled inside an R string literal.
    ID1 = paste0(str_extract(ID1, "^abc.\\d+"), "-", Passage)
  )
#> # A tibble: 5 x 8
#> # Groups: ID2 [2]
#> ID1 ID2 Sample_ID Type Date Passage colonies days
#> <chr> <fctr> <fctr> <fctr> <fctr> <fctr> <int> <int>
#> 1 abc-0001-P0 abc-0001 abc-0001-T cells 3/22/16 P0 23 0
#> 2 abc-0001-P1 abc-0001 abc-0001-T cells 3/29/16 P1 30 7
#> 3 abc-0002-P0 abc-0002 abc-0002-N cells 4/03/16 P0 5 0
#> 4 abc-0002-P1 abc-0002 abc-00021-T cells 5/05/16 P1 18 32
#> 5 abc-0002-P1 abc-0002 abc-00021-T cells 5/19/16 P1 27 46
这是数据,以防其他人想试一试;
# Example data as dput() output: a 9-row tibble with factor columns ID1, ID2,
# Sample_ID, Type, Date and Passage plus an integer `colonies` column.
# Passage and colonies are NA on the four rows that have no passage.
tbl <- structure(list(ID1 = structure(c(4L, 3L, 1L, 2L, 8L, 9L, 5L,
6L, 7L), .Label = c("abc-0001-N", "abc-0001-P1", "abc-0001-T",
"abc-0001-T-P0", "abc-0002-N-SFT", "abc-0002-P1", "abc-0002-P2",
"abc-0002-T-P0", "abc-0002-T-SFT"), class = "factor"), ID2 = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("abc-0001", "abc-0002"
), class = "factor"), Sample_ID = structure(c(2L, 2L, 1L, 2L,
4L, 4L, 3L, 5L, 5L), .Label = c("abc-0001-N", "abc-0001-T", "abc-0002-N",
"abc-0002-T", "abc-00021-T"), class = "factor"), Type = structure(c(1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("cells", "frozen"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 2L, 3L,
3L, 3L, 4L, 5L), .Label = c("3/22/16", "3/29/16", "4/03/16",
"5/05/16", "5/19/16"), class = "factor"), Passage = structure(c(1L,
NA, NA, 2L, NA, NA, 1L, 2L, 2L), .Label = c("P0", "P1"), class = "factor"),
colonies = c(23L, NA, NA, 30L, NA, NA, 5L, 18L, 27L)), .Names = c("ID1",
"ID2", "Sample_ID", "Type", "Date", "Passage", "colonies"), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
下面是实际生产数据的代码,
# For every ID2 group, keep the rows that have a passage and add `days`:
# the number of days between `Date_Sample_Created?` of each row and of the
# group's P0 row. ID is rebuilt as "<abc prefix>-<Passage>".
a %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    # Temporary Date column; as.Date() accepts both the "YYYY-MM-DD" strings
    # and a column that was already converted to Date.
    created = as.Date(`Date_Sample_Created?`),
    days = as.integer(difftime(
      created, created[Passage == "P0"][1],
      units = "days"
    )),
    # The "abc-NNNN" prefix lives in Derived_Sample_ID; the ID column holds
    # plain numbers, so extracting from it only yields NA.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  ) %>%
  select(-created)
#> # A tibble: 9 x 20
#> # Groups: ID2 [2]
#> ID Derived_Sample_ID ID2 Sample_ID `Derived_Sample_Type?`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 abc-0001-P0 abc-0001-T-p0 abc-0001 abc-0001-T cells
#> 2 abc-0002-P0 abc-0002-T-p0 abc-0002 abc-0002-T cells
#> 3 abc-0002-P1 abc-0002-T-p1 abc-0002 abc-0002-T cells
#> ...
或更好,
# Per ID2 group: days elapsed since the group's P0 sample was created,
# computed inline from `Date_Sample_Created?` without a helper column.
a %>%
  group_by(ID2) %>%
  filter(!is.na(Passage)) %>%
  mutate(
    days = as.integer(difftime(
      as.Date(`Date_Sample_Created?`),
      as.Date(`Date_Sample_Created?`[Passage == "P0"][1]),
      units = "days"
    )),
    # Take the "abc-NNNN" prefix from Derived_Sample_ID (ID is numeric text);
    # the backslash must be doubled inside an R string literal.
    ID = paste0(str_extract(Derived_Sample_ID, "^abc.\\d+"), "-", Passage)
  )
不能把它作为评论发布,因为它太长了。我所做的唯一改变是将日期列转换为 Date 类型:
a$`Date_Sample_Created?` <- as.Date(a$`Date_Sample_Created?`, format="%Y-%m-%d")
希望对您有所帮助。
# dput() output of a 14-row tibble (presumably the production table `a` used
# in the snippets; the value is not assigned to a name here).
# All columns are character vectors except the four all-NA DNA_concentration
# columns. `Passage` and `colonies` repeat the values of `Passage_#` and
# `colonies_#`. `Split_Ratio?` mixes the string "NA" with real NA values.
structure(list(ID = c("36", "37", "38", "45", "46", "47", "48",
"57", "59", "121", "131", "132", "134", "206"), Derived_Sample_ID = c("abc-0001-T-p0",
"abc-0001-T-SFT", "abc-0001-N-SFT", "abc-0002-T-p0", "abc-0002-T-SFT",
"abc-0002-N-SFT", "abc-0002-T-p1", "abc-0002-T-p2", "abc-0001-T-CPT",
"abc-0001-T-p1", "abc-0001-T-p2", "abc-0002-T-p3", "abc-0001-T-p3",
"abc-0002-T-P4"), ID2 = c("abc-0001", "abc-0001", "abc-0001",
"abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0002", "abc-0001",
"abc-0001", "abc-0001", "abc-0002", "abc-0001", "abc-0002"),
Sample_ID = c("abc-0001-T", "abc-0001-T", "abc-0001-N", "abc-0002-T",
"abc-0002-T", "abc-0002-N", "abc-0002-T", "abc-0002-T", "abc-0001-T",
"abc-0001-T", "abc-0001-T", "abc-0002-T", "abc-0001-T", "abc-0002-T"
), `Derived_Sample_Type?` = c("cells", "Frozen tissue", "Frozen tissue",
"cells", "Frozen tissue", "Frozen tissue", "cells", "cells",
"Frozen tissue", "cells", "cells", "cells", "cells", "cells"
), `DNA_concentration?` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), `DNA_concentration?__1` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Passage_#` = c("P0",
NA, NA, "P0", NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3",
"P4"), `Culture_Plate_Type?` = c("24-well plate", NA, NA,
"24-well plate", NA, NA, "24-well plate", "24-well plate",
NA, "24-well plate", "24-well plate", "24-well plate", "24-well plate",
"6-well plate"), `colonies_#` = c("23", NA, NA, "12", NA,
NA, "10", "8", NA, "23", "24", "14", "6", "21"), `Split_Ratio?` = c("NA",
NA, NA, "NA", NA, NA, "1:1", "1:1", NA, "1:1", "1:1", "1:2",
NA, "NA"), `DNA_concentration?__2` = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), `DNA_concentration?__3` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `Date_Sample_Created?` = c("2017-07-26",
"2017-07-26", "2017-07-26", "2017-07-28", "2017-07-28", "2017-07-28",
"2017-07-31", "2017-08-11", "2017-07-26", "2017-08-21", "2017-08-22",
"2017-08-22", "2017-09-01", "2017-09-06"), `Time_Sample_Created?` = c("6:30PM",
"5:00PM", "5:00PM", "1:00PM", "11:00AM", "11:00AM", "04:30 PM",
"04:00 PM", "5:00PM", "10:00AM", "05:00 PM", "05:00 PM",
"01:30 PM", "4:00 PM"), `SOP_used?` = c("Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of Lung Tumor cellss - #6", "Culture of Lung Tumor cellss - #6",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5", "Culture of cellss - #5",
"Culture of cellss - #5", "Culture of cellss - #5"), `Any_changes_to_SOP?` = c("Yes",
"No", "No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes",
"Yes", "Yes", "Yes", "Yes"), Passage = c("P0", NA, NA, "P0",
NA, NA, "P1", "P2", NA, "P1", "P2", "P3", "P3", "P4"), colonies = c("23",
NA, NA, "12", NA, NA, "10", "8", NA, "23", "24", "14", "6",
"21")), .Names = c("ID", "Derived_Sample_ID", "ID2", "Sample_ID",
"Derived_Sample_Type?", "DNA_concentration?", "DNA_concentration?__1",
"Passage_#", "Culture_Plate_Type?", "colonies_#", "Split_Ratio?",
"DNA_concentration?__2", "DNA_concentration?__3", "Date_Sample_Created?",
"Time_Sample_Created?", "SOP_used?", "Any_changes_to_SOP?", "Passage",
"colonies"), row.names = c(NA, -14L), class = c("tbl_df", "tbl",
"data.frame"))
非常感谢!!!