R:在给定截止值数据帧的情况下计算矩阵的频率 table

R: calculate frequency table of a matrix given a data frame of cutoff values

假设我有一个如下所示的矩阵,每个 idmarker 个值,每个 id 有 10 个事件(在这个例子中):

set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
>   head(mymat)
              a          b          c          d          e          f          g          h          i          j
id1 -0.56047565  0.4264642  0.3796395  0.9935039  0.1176466  0.7877388 -1.0633261  0.1192452 -0.7886220  0.8450130
id1 -0.23017749 -0.2950715 -0.5023235  0.5483970 -0.9474746  0.7690422  1.2631852  0.2436874 -0.5021987  0.9625280
id1  1.55870831  0.8951257 -0.3332074  0.2387317 -0.4905574  0.3322026 -0.3496504  1.2324759  1.4960607  0.6843094
id1  0.07050839  0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1  0.12928774  0.8215811 -1.0717912  1.3606524  1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516  0.8496430
id1  1.71506499  0.6886403  0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759  1.6756969  1.9023618 -0.4465572

以及截止值的相关数据框(每个 idmarker 的最小和最大截止值),就像这个:

cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
>   head(cutoff_df)
   id marker       min      max
1 id1      a 0.4744594 6.518271
2 id1      b 1.3729807 6.689669
3 id1      c 0.4516368 5.915843
4 id1      d 0.6369892 6.459263
5 id1      e 0.3479676 5.208157
6 id1      f 1.6028592 5.439966

我想在这里做的是计算一个频率 table,这样我就可以记录每个 idmarker 的事件百分比 marker =15=] 和 marker.

这是我尝试使用一些丑陋的嵌套循环...想知道是否有更好更简洁的方法来执行此操作,最好使用基本函数或 data.table 或 tidyr...

我丑陋的代码:

freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
  mk_freq <- NULL
  for (id in unique(rownames(mymat))){
    data <- mymat[rownames(mymat)==id,mk]
    min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
    max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
    ins <- length(data[data>=min & data<=max])
    freq <- ins/length(data)*100
    mk_freq <- c(mk_freq, freq)
  }
  mk_freq <- as.data.frame(mk_freq)
  names(mk_freq) <- mk
  freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
    freq_mat  a  b  c  d  e  f  g  h  i  j
id1       NA 20  0 20 40 10  0 30 10 20 30
id2       NA 10 30 30  0 20 10 10  0  0 70
id3       NA  0  0  0  0 30 10 30 10 30 60

是这样的吗?这里,所有单元格的总和是100。

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

reprex package (v2.0.0)

创建于 2022-03-30

这是一个基于 purrr 包的解决方案。我不确定它是否更干净,但更短。

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##> + # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6

另一种可能的解决方案,基于dplyr

library(dplyr)

data.frame(id = rownames(mymat), mymat) %>% 
 group_by(id) %>% 
 summarise(across(everything(),
 ~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
      .x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>% 
 mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)

#> # A tibble: 3 x 11
#>   id        a     b     c     d     e     f     g     h     i     j
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  0     3.77  7.55  1.89  0     5.66  1.89  3.77  5.66
#> 2 id2    1.89  5.66  5.66  0     3.77  1.89  1.89  0     0    13.2 
#> 3 id3    0     0     0     0     5.66  1.89  5.66  1.89  5.66 11.3