对于某一列的每个值,将该值除以另一列中具有相同序列的行数(用 R 统计多重比对的 reads)
For each value of a column divide that value by the number of rows that have the same sequence in another column (counting multimapped reads with R)
这是我的数据集的一部分:
structure(list(V1 = c("t00000406", "t00000517", "t00000519",
"t00000589", "t00000589", "t00000598", "t00000804", "t00000938",
"t00001008", "t00001156", "t00001156", "t00001156", "t00001165",
"t00001165", "t00001165", "t00001265", "t00001265", "t00001265",
"t00001511", "t00001562", "t00001562", "t00001599", "t00001703",
"t00001703", "t00001703", "t00001710", "t00001710", "t00001710",
"t00001710"), V2 = c(617L, 445L, 439L, 357L, 357L, 352L, 234L,
192L, 177L, 151L, 151L, 151L, 149L, 149L, 149L, 138L, 138L, 138L,
114L, 111L, 111L, 108L, 101L, 101L, 101L, 101L, 101L, 101L, 101L
), V4 = c("piR-hsa-3546", "piR-hsa-3454", "piR-hsa-3546", "piR-hsa-6909",
"piR-hsa-6908", "piR-hsa-3454", "piR-hsa-3454", "piR-hsa-3454",
"piR-hsa-3454", "piR-hsa-31261", "piR-hsa-14100", "piR-hsa-14099",
"piR-hsa-28592", "piR-hsa-6592", "piR-hsa-6591", "piR-hsa-14099",
"piR-hsa-31261", "piR-hsa-14100", "piR-hsa-6909", "piR-hsa-16270",
"piR-hsa-16271", "piR-hsa-620", "piR-hsa-31261", "piR-hsa-14100",
"piR-hsa-14099", "piR-hsa-14098", "piR-hsa-14100", "piR-hsa-14099",
"piR-hsa-31261"), V6 = c("CTGTTAACCGAAAGGTTGGTGGT", "CACGTGTTAGGACCCGAAAGA",
"CGGCTGTTAACCGAAAGGTTGGTGGT", "GTTTCCGTAGTGTAGTGGTCATC", "GTTTCCGTAGTGTAGTGGTCATC",
"ACGTGTTAGGACCCGAAAGA", "CGTGTTAGGACCCGAAAGA", "TGTTAGGACCCGAAAGA",
"CGCACGTGTTAGGACCCGAAAGA", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC",
"TCCCTGGTGGTCTAGTGGTTAGGATTCGGC", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC",
"GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG",
"TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT",
"GTTTCCGTAGTGTAGTGGTCATCACGTTCGCC", "CTGAGGGTCCAGGGT", "CTGAGGGTCCAGGGT",
"CGTAGTTCCGACCATAAACGATGCC", "TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGATTC",
"TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT",
"TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT")), row.names = c(NA,
-29L), class = c("tbl_df", "tbl", "data.frame"))
我想用 mutate 把 V2 列的每个值除以该行的序列(V6 列)在数据中出现的次数。例如:
序列GTAGTCGTGGCCGAGTGGTTAAG存在3次
nrow(filter(my_data,V6=="GTAGTCGTGGCCGAGTGGTTAAG"))
[1] 3
结果:
filter(pir_onehun,V6=="GTAGTCGTGGCCGAGTGGTTAAG") %>% mutate(V2=V2/nrow(filter(pir_onehun,V6=="GTAGTCGTGGCCGAGTGGTTAAG")) )
# A tibble: 3 x 4
V1 V2 V4 V6
<chr> <dbl> <chr> <chr>
1 t00001165 49.7 piR-hsa-28592 GTAGTCGTGGCCGAGTGGTTAAG
2 t00001165 49.7 piR-hsa-6592 GTAGTCGTGGCCGAGTGGTTAAG
3 t00001165 49.7 piR-hsa-6591 GTAGTCGTGGCCGAGTGGTTAAG
我想过对每个序列分别这样计算,再用 bind_rows 把结果逐个追加到一个新的数据框中,但一定有另一种更 "tidy" 的方式。
感谢您的宝贵时间
df %>% group_by(V6) %>% mutate(V2 = V2 / n())
即可完成任务。首先按序列(V6 列)分组,然后将 V2 除以该组的大小,即 n()。注意:结果仍然按 V6 分组,如需继续做其他处理,请在末尾加上 ungroup()。
这是我的数据集的一部分:
structure(list(V1 = c("t00000406", "t00000517", "t00000519",
"t00000589", "t00000589", "t00000598", "t00000804", "t00000938",
"t00001008", "t00001156", "t00001156", "t00001156", "t00001165",
"t00001165", "t00001165", "t00001265", "t00001265", "t00001265",
"t00001511", "t00001562", "t00001562", "t00001599", "t00001703",
"t00001703", "t00001703", "t00001710", "t00001710", "t00001710",
"t00001710"), V2 = c(617L, 445L, 439L, 357L, 357L, 352L, 234L,
192L, 177L, 151L, 151L, 151L, 149L, 149L, 149L, 138L, 138L, 138L,
114L, 111L, 111L, 108L, 101L, 101L, 101L, 101L, 101L, 101L, 101L
), V4 = c("piR-hsa-3546", "piR-hsa-3454", "piR-hsa-3546", "piR-hsa-6909",
"piR-hsa-6908", "piR-hsa-3454", "piR-hsa-3454", "piR-hsa-3454",
"piR-hsa-3454", "piR-hsa-31261", "piR-hsa-14100", "piR-hsa-14099",
"piR-hsa-28592", "piR-hsa-6592", "piR-hsa-6591", "piR-hsa-14099",
"piR-hsa-31261", "piR-hsa-14100", "piR-hsa-6909", "piR-hsa-16270",
"piR-hsa-16271", "piR-hsa-620", "piR-hsa-31261", "piR-hsa-14100",
"piR-hsa-14099", "piR-hsa-14098", "piR-hsa-14100", "piR-hsa-14099",
"piR-hsa-31261"), V6 = c("CTGTTAACCGAAAGGTTGGTGGT", "CACGTGTTAGGACCCGAAAGA",
"CGGCTGTTAACCGAAAGGTTGGTGGT", "GTTTCCGTAGTGTAGTGGTCATC", "GTTTCCGTAGTGTAGTGGTCATC",
"ACGTGTTAGGACCCGAAAGA", "CGTGTTAGGACCCGAAAGA", "TGTTAGGACCCGAAAGA",
"CGCACGTGTTAGGACCCGAAAGA", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC",
"TCCCTGGTGGTCTAGTGGTTAGGATTCGGC", "TCCCTGGTGGTCTAGTGGTTAGGATTCGGC",
"GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG", "GTAGTCGTGGCCGAGTGGTTAAG",
"TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT", "TCCCTGGTGGTCTAGTGGTTAGGATT",
"GTTTCCGTAGTGTAGTGGTCATCACGTTCGCC", "CTGAGGGTCCAGGGT", "CTGAGGGTCCAGGGT",
"CGTAGTTCCGACCATAAACGATGCC", "TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGATTC",
"TCCCTGGTGGTCTAGTGGTTAGGATTC", "TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT",
"TCCCTGGTGGTCTAGTGGTTAGGAT", "TCCCTGGTGGTCTAGTGGTTAGGAT")), row.names = c(NA,
-29L), class = c("tbl_df", "tbl", "data.frame"))
我想用 mutate 把 V2 列的每个值除以该行的序列(V6 列)在数据中出现的次数。例如:
序列GTAGTCGTGGCCGAGTGGTTAAG存在3次
nrow(filter(my_data,V6=="GTAGTCGTGGCCGAGTGGTTAAG"))
[1] 3
结果:
filter(pir_onehun,V6=="GTAGTCGTGGCCGAGTGGTTAAG") %>% mutate(V2=V2/nrow(filter(pir_onehun,V6=="GTAGTCGTGGCCGAGTGGTTAAG")) )
# A tibble: 3 x 4
V1 V2 V4 V6
<chr> <dbl> <chr> <chr>
1 t00001165 49.7 piR-hsa-28592 GTAGTCGTGGCCGAGTGGTTAAG
2 t00001165 49.7 piR-hsa-6592 GTAGTCGTGGCCGAGTGGTTAAG
3 t00001165 49.7 piR-hsa-6591 GTAGTCGTGGCCGAGTGGTTAAG
我想过对每个序列分别这样计算,再用 bind_rows 把结果逐个追加到一个新的数据框中,但一定有另一种更 "tidy" 的方式。
感谢您的宝贵时间
df %>% group_by(V6) %>% mutate(V2 = V2 / n())
即可完成任务。首先按序列(V6 列)分组,然后将 V2 除以该组的大小,即 n()。注意:结果仍然按 V6 分组,如需继续做其他处理,请在末尾加上 ungroup()。