R:在给定截止值数据帧的情况下计算矩阵的频率 table
R: calculate frequency table of a matrix given a data frame of cutoff values
假设我有一个如下所示的矩阵,每个 id
有 marker
个值,每个 id
有 10 个事件(在这个例子中):
set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
> head(mymat)
a b c d e f g h i j
id1 -0.56047565 0.4264642 0.3796395 0.9935039 0.1176466 0.7877388 -1.0633261 0.1192452 -0.7886220 0.8450130
id1 -0.23017749 -0.2950715 -0.5023235 0.5483970 -0.9474746 0.7690422 1.2631852 0.2436874 -0.5021987 0.9625280
id1 1.55870831 0.8951257 -0.3332074 0.2387317 -0.4905574 0.3322026 -0.3496504 1.2324759 1.4960607 0.6843094
id1 0.07050839 0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1 0.12928774 0.8215811 -1.0717912 1.3606524 1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516 0.8496430
id1 1.71506499 0.6886403 0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759 1.6756969 1.9023618 -0.4465572
以及截止值的相关数据框(每个 id
和 marker
的最小和最大截止值),就像这个:
cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
> head(cutoff_df)
id marker min max
1 id1 a 0.4744594 6.518271
2 id1 b 1.3729807 6.689669
3 id1 c 0.4516368 5.915843
4 id1 d 0.6369892 6.459263
5 id1 e 0.3479676 5.208157
6 id1 f 1.6028592 5.439966
我想在这里做的是计算一个频率 table,这样我就可以记录每个 id
和 marker
的事件百分比 marker
=15=] 和 marker
.
这是我尝试使用一些丑陋的嵌套循环...想知道是否有更好更简洁的方法来执行此操作,最好使用基本函数或 data.table 或 tidyr...
我丑陋的代码:
freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
mk_freq <- NULL
for (id in unique(rownames(mymat))){
data <- mymat[rownames(mymat)==id,mk]
min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
ins <- length(data[data>=min & data<=max])
freq <- ins/length(data)*100
mk_freq <- c(mk_freq, freq)
}
mk_freq <- as.data.frame(mk_freq)
names(mk_freq) <- mk
freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
freq_mat a b c d e f g h i j
id1 NA 20 0 20 40 10 0 30 10 20 30
id2 NA 10 30 30 0 20 10 10 0 0 70
id3 NA 0 0 0 0 30 10 30 10 30 60
是这样的吗?这里,所有单元格的总和是100。
library(tidyverse)
set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
id = paste0("id", rep(1:3, each = 10)),
marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)
mymat %>%
as_tibble(rownames = "id") %>%
pivot_longer(-id, names_to = "marker") %>%
left_join(cutoff_df) %>%
filter(value <= max & value >= min) %>%
count(id, marker) %>%
# group_by(marker) %>% # e.g. to make sum of 100 per marker
mutate(n = n / sum(n) * 100) %>%
pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#> id a c d e g h i j b f
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 3.77 7.55 1.89 5.66 1.89 3.77 5.66 0 0
#> 2 id2 1.89 5.66 0 3.77 1.89 0 0 13.2 5.66 1.89
#> 3 id3 0 0 0 5.66 5.66 1.89 5.66 11.3 0 1.89
由 reprex package (v2.0.0)
创建于 2022-03-30
这是一个基于 purrr
包的解决方案。我不确定它是否更干净,但更短。
library(purrr)
asplit(mymat,2) |>
imap(~{
with(filter(cutoff_df, marker == .y),
outer(.x, min, ">=") &
outer(.x, max, "<") &
outer(names(.x), id, "=="))
}) |>
map(rowSums) |>
map_dfr(~tapply(.x, names(.x), FUN = sum),
.id = "marker")
##> + # A tibble: 10 × 4
##> marker id1 id2 id3
##> <chr> <dbl> <dbl> <dbl>
##> 1 a 2 1 0
##> 2 b 0 3 0
##> 3 c 2 3 0
##> 4 d 4 0 0
##> 5 e 1 2 3
##> 6 f 0 1 1
##> 7 g 3 1 3
##> 8 h 1 0 1
##> 9 i 2 0 3
##> 10 j 3 7 6
另一种可能的解决方案,基于dplyr
:
library(dplyr)
data.frame(id = rownames(mymat), mymat) %>%
group_by(id) %>%
summarise(across(everything(),
~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
.x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>%
mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)
#> # A tibble: 3 x 11
#> id a b c d e f g h i j
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 0 3.77 7.55 1.89 0 5.66 1.89 3.77 5.66
#> 2 id2 1.89 5.66 5.66 0 3.77 1.89 1.89 0 0 13.2
#> 3 id3 0 0 0 0 5.66 1.89 5.66 1.89 5.66 11.3
假设我有一个如下所示的矩阵,每个 id
有 marker
个值,每个 id
有 10 个事件(在这个例子中):
set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
> head(mymat)
a b c d e f g h i j
id1 -0.56047565 0.4264642 0.3796395 0.9935039 0.1176466 0.7877388 -1.0633261 0.1192452 -0.7886220 0.8450130
id1 -0.23017749 -0.2950715 -0.5023235 0.5483970 -0.9474746 0.7690422 1.2631852 0.2436874 -0.5021987 0.9625280
id1 1.55870831 0.8951257 -0.3332074 0.2387317 -0.4905574 0.3322026 -0.3496504 1.2324759 1.4960607 0.6843094
id1 0.07050839 0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1 0.12928774 0.8215811 -1.0717912 1.3606524 1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516 0.8496430
id1 1.71506499 0.6886403 0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759 1.6756969 1.9023618 -0.4465572
以及截止值的相关数据框(每个 id
和 marker
的最小和最大截止值),就像这个:
cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
> head(cutoff_df)
id marker min max
1 id1 a 0.4744594 6.518271
2 id1 b 1.3729807 6.689669
3 id1 c 0.4516368 5.915843
4 id1 d 0.6369892 6.459263
5 id1 e 0.3479676 5.208157
6 id1 f 1.6028592 5.439966
我想在这里做的是计算一个频率 table,这样我就可以记录每个 id
和 marker
的事件百分比 marker
=15=] 和 marker
.
这是我尝试使用一些丑陋的嵌套循环...想知道是否有更好更简洁的方法来执行此操作,最好使用基本函数或 data.table 或 tidyr...
我丑陋的代码:
freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
mk_freq <- NULL
for (id in unique(rownames(mymat))){
data <- mymat[rownames(mymat)==id,mk]
min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
ins <- length(data[data>=min & data<=max])
freq <- ins/length(data)*100
mk_freq <- c(mk_freq, freq)
}
mk_freq <- as.data.frame(mk_freq)
names(mk_freq) <- mk
freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
freq_mat a b c d e f g h i j
id1 NA 20 0 20 40 10 0 30 10 20 30
id2 NA 10 30 30 0 20 10 10 0 0 70
id3 NA 0 0 0 0 30 10 30 10 30 60
是这样的吗?这里,所有单元格的总和是100。
library(tidyverse)
set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
id = paste0("id", rep(1:3, each = 10)),
marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)
mymat %>%
as_tibble(rownames = "id") %>%
pivot_longer(-id, names_to = "marker") %>%
left_join(cutoff_df) %>%
filter(value <= max & value >= min) %>%
count(id, marker) %>%
# group_by(marker) %>% # e.g. to make sum of 100 per marker
mutate(n = n / sum(n) * 100) %>%
pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#> id a c d e g h i j b f
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 3.77 7.55 1.89 5.66 1.89 3.77 5.66 0 0
#> 2 id2 1.89 5.66 0 3.77 1.89 0 0 13.2 5.66 1.89
#> 3 id3 0 0 0 5.66 5.66 1.89 5.66 11.3 0 1.89
由 reprex package (v2.0.0)
创建于 2022-03-30这是一个基于 purrr
包的解决方案。我不确定它是否更干净,但更短。
library(purrr)
asplit(mymat,2) |>
imap(~{
with(filter(cutoff_df, marker == .y),
outer(.x, min, ">=") &
outer(.x, max, "<") &
outer(names(.x), id, "=="))
}) |>
map(rowSums) |>
map_dfr(~tapply(.x, names(.x), FUN = sum),
.id = "marker")
##> + # A tibble: 10 × 4
##> marker id1 id2 id3
##> <chr> <dbl> <dbl> <dbl>
##> 1 a 2 1 0
##> 2 b 0 3 0
##> 3 c 2 3 0
##> 4 d 4 0 0
##> 5 e 1 2 3
##> 6 f 0 1 1
##> 7 g 3 1 3
##> 8 h 1 0 1
##> 9 i 2 0 3
##> 10 j 3 7 6
另一种可能的解决方案,基于dplyr
:
library(dplyr)
data.frame(id = rownames(mymat), mymat) %>%
group_by(id) %>%
summarise(across(everything(),
~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
.x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>%
mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)
#> # A tibble: 3 x 11
#> id a b c d e f g h i j
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 0 3.77 7.55 1.89 0 5.66 1.89 3.77 5.66
#> 2 id2 1.89 5.66 5.66 0 3.77 1.89 1.89 0 0 13.2
#> 3 id3 0 0 0 0 5.66 1.89 5.66 1.89 5.66 11.3