编码挑战:当其他列中对应的行为 TRUE 时,计算某一列数值的总和(表格汇总)

Coding challenge: Calculate sum of col values if rows in other col are TRUE (table summary)

我们有一个表,其中第一列包含数值(蛋白质计数),随后的各列是逻辑向量(TRUE 或 FALSE,表示该 protein_id 是否具有相应属性)。对于每个属性列,我们希望得到该列为 TRUE 的所有行的数值总和,以及 TRUE 的个数。

使用示例数据,任务可能更适合描述:

[请原谅示例数据需要一个随机 ID 生成器包;如果您知道 base R 的解决方案,请发表评论,我会在此处补充。]

library("stringi")

# Simulated protein counts: five batches of 20 draws, each batch sampled with
# replacement from its own integer range, concatenated into 100 values.
value <- c(sample(2:5, 20, replace = TRUE),
           sample(6:10, 20, replace = TRUE),
           sample(1:7, 20, replace = TRUE),
           sample(3:10, 20, replace = TRUE),
           sample(10:20, 20, replace = TRUE))

# Example table: a random 5-character id, its value, and three logical
# property flags.
# NOTE(review): `value` has 100 elements while every other column has 20, so
# data.frame() recycles the ids and flags five times (100 rows, each id
# repeated) -- confirm this is intended.
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE))

对于每个属性列,我们希望得到所有数值的总和以及所有 ID 的个数。接下来,检查是否有行在多个属性列中同时为 TRUE。如果有:新增一行,其属性名由这些列名用 _ 连接而成,并给出相应数值的总和。最后一列包含所有相关的 ID,用 ; 分隔。

# Hand-written sketch of the desired summary table: one row per property (or
# property combination). value_sum holds the placeholders "x", "y", "z", "w"
# rather than computed sums.
expected_result_1 <- data.frame(
  property = c(
    "nucleus",
    "membrane",
    "mitochondria",
    "nucleus_ membrane"
  ),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv_htiFJ"
  )
)

dplyr 解决方案会很棒!

谢谢!

塞巴斯蒂安

这是您要找的吗? (以下代码已被编辑!)

library("dplyr")
library("tidyr")

# Reshape to long format: one row per (row of data, property) pair. The
# identifier columns are excluded by exact name, so a property whose name
# merely contains "id" or "value" is still pivoted.
data <- data %>%
  pivot_longer(cols = -c(id, value),
               names_to = "property", values_to = "is_true")

# Keep only the rows where the property is set.
data <- data %>% filter(is_true)

# One summary row per property: sum of the values, the distinct ids joined by
# ",", and the number of distinct ids. The final arrange() restores the order
# in which the properties first appear in `data` (summarise() sorts groups).
data %>%
  group_by(property) %>%
  summarise(value_sum = sum(value),
            ids = paste0(unique(id), collapse = ","),
            n_ids = n_distinct(id)) %>%
  arrange(match(property, unique(data$property)))

# NOTE: the printout below was produced before ids were de-duplicated, so it
# shows repeated ids and n_ids equal to the number of matching rows; re-run
# the code above to see distinct ids and their count.
# # A tibble: 3 x 4
# property    value_sum ids                                                                                      n_ids
# <chr>           <int> <chr>                                                                                    <int>
#   1 membrane          400 ACiov,3XyaR,z68K4,GaUxZ,YZioV,mEZtJ,J5T67,T6Ine,a33ed,Flgnx,g33vm,ACiov,3XyaR,z68K4,GaU…    55
#   2 nucleus           312 tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8EgQ,Sl9nM,T6Ine,2zgbM,tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8E…    45
#   3 mitochondr…       310 tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6Ine,ViWyl,2zgbM,g33vm,tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6I…    45

更新:

我误解了OP的问题。我在这里包括一个仍然使用 dplyr(某种程度上)并使用 gtools 来计算新列名称的解决方案。

library("stringi")
library("stringr")
library("dplyr")
library("magrittr")
library("tidyr")
library("gtools")

# Sample data.
# Simulated protein counts: five batches of 20 draws, each batch sampled with
# replacement from its own integer range, concatenated into 100 values.
value <- c(sample(2:5, 20, replace = TRUE),
           sample(6:10, 20, replace = TRUE),
           sample(1:7, 20, replace = TRUE),
           sample(3:10, 20, replace = TRUE),
           sample(10:20, 20, replace = TRUE))

# Example table: a random 5-character id, its value, and three logical
# property flags.
# NOTE(review): `value` has 100 elements while every other column has 20, so
# data.frame() recycles the ids and flags five times (100 rows, each id
# repeated) -- confirm this is intended.
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE))


# Hand-written sketch of the desired summary table: one row per property (or
# property combination). value_sum holds the placeholders "x", "y", "z", "w"
# rather than computed sums.
expected_result_1 <- data.frame(
  property = c(
    "nucleus",
    "membrane",
    "mitochondria",
    "nucleus_ membrane"
  ),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv_htiFJ"
  )
)




#SOLUTION.

# Getting the property column names: every column of `data` except the
# identifier and the value, matched by exact name. (A regex such as
# "id|value" would also drop any property whose name merely contains "id" or
# "value", e.g. "lipid".)
cols <- setdiff(colnames(data), c("id", "value"))

# Build the names of all combinations of `x` property columns.
#
# x: number of columns per combination (1 <= x <= length(cols)).
#
# Reads the character vector `cols` from the enclosing environment. Returns a
# character vector with one element per combination; within each element the
# column names are in sorted order and joined by "_", and the combinations
# themselves are in lexicographic order.
#
# combn() enumerates the combinations directly, so there is no need to
# generate every permutation and then split, sort, re-paste and de-duplicate.
myfunc <- function(x) {
  combos <- combn(sort(unique(cols)), x, simplify = FALSE)
  vapply(combos, paste, character(1), collapse = "_")
}

# Using the friendly function from above to get all possible combinations of
# two or more of the relevant column names. seq_along(cols)[-1] is empty when
# there are fewer than two columns, whereas 2:length(cols) would count down.
newcols <- unlist(lapply(seq_along(cols)[-1], myfunc))

# Adding these columns to the data.frame and evaluating them conditionally.
# (So if all corresponding individual columns are TRUE, then the new column
# is also TRUE, and FALSE otherwise.)
for (newcol in newcols) {
  # Recover the component column names; this assumes that no property name
  # itself contains "_".
  currefs <- strsplit(newcol, "_", fixed = TRUE)[[1]]
  # Element-wise AND across the component columns.
  data[[newcol]] <- Reduce(`&`, data[currefs])
}


# Pivoting longer to gather all the property columns together: one row per
# (row of data, property) pair. The identifier columns are excluded by exact
# name rather than by an unanchored regex.
data <- data %>%
  pivot_longer(cols = -c(id, value),
               names_to = "property", values_to = "is_true")

# Retaining only TRUE values.
data <- data %>% filter(is_true)

# Calculating value_sum, ids, and n_ids: one summary row per property.
# n_ids is counted directly from the id column instead of re-splitting the
# pasted string. The final arrange() restores the order in which the
# properties first appear in `data` (summarise() sorts groups).
data %>%
  group_by(property) %>%
  summarise(value_sum = sum(value),
            ids = paste0(unique(id), collapse = ","),
            n_ids = n_distinct(id)) %>%
  arrange(match(property, unique(data$property)))


# # A tibble: 7 x 4
# property                    value_sum ids                                                                      n_ids
# <chr>                           <int> <chr>                                                                    <int>
# 1 nucleus                           472 LpA3Q,2s04A,m1QMR,Lh4HH,wILrx,xKLfq,hUvsn,22cPw,NMTgz,V42mZ,GnJBd,Fwjbr…    13
# 2 membrane                          521 LpA3Q,m1QMR,kYSIh,Lh4HH,CDPr4,wILrx,qAPpb,Zfavp,hUvsn,22cPw,NMTgz,N0RPZ…    14
# 3 mitochondria                      252 LpA3Q,kYSIh,Zfavp,N0RPZ,V42mZ,GnJBd,onM21                                    7
# 4 membrane_mitochondria             182 LpA3Q,kYSIh,Zfavp,N0RPZ,GnJBd                                                5
# 5 membrane_nucleus                  338 LpA3Q,m1QMR,Lh4HH,wILrx,hUvsn,22cPw,NMTgz,GnJBd,Fwjbr                        9
# 6 mitochondria_nucleus              142 LpA3Q,V42mZ,GnJBd,onM21                                                      4
# 7 membrane_mitochondria_nucl…        72 LpA3Q,GnJBd                                                                  2

我不确定下面的代码是否能给出所需的输出,但这是一个 base R 的尝试。

  • 首先,我们可以定义一个用户函数 f,它有助于按属性汇总 data 中的信息
# Summarise the rows of `df` in which every column named in `cols` is TRUE.
#
# cols: character vector naming logical columns of `df`.
# df:   data frame with `id`, `value` and the columns in `cols`; defaults to
#       the `data` object visible from the function's enclosing environment.
#
# Returns a one-row data.frame with
#   property  - the names in `cols` joined by "_",
#   value_sum - sum of `value` over the matching rows (NA values ignored),
#   n_ids     - number of distinct ids among the matching rows,
#   ids       - those distinct ids joined by ", ".
f <- function(cols, df = data) {
  # A row matches only if all flags are TRUE. With na.rm = TRUE an NA flag
  # counts as 0, so the row cannot reach the full count and is excluded,
  # instead of turning the index into NA and leaking NA into n_ids and ids.
  idx <- rowSums(df[cols], na.rm = TRUE) == length(cols)
  matched_ids <- unique(df$id[idx])
  data.frame(
    property = paste0(cols, collapse = "_"),
    value_sum = sum(df$value[idx], na.rm = TRUE),
    n_ids = length(matched_ids),
    ids = toString(matched_ids)
  )
}
  • 然后,我们选择列(v 是所选列名的向量),并运行下面的代码
# Property columns to summarise.
v <- c("nucleus", "membrane", "mitochondria")

# For every combination size from 1 to length(v), apply f to each combination
# of column names, concatenate the per-size lists of one-row data frames into
# a single list, and stack them into one table.
output <- do.call(
  rbind,
  do.call(
    c,
    lapply(
      seq_along(v),
      function(size) combn(v, size, FUN = f, simplify = FALSE)
    )
  )
)

我们会得到

> output
                       property value_sum n_ids
1                       nucleus       406    11
2                      membrane       367    10
3                  mitochondria       278     8
4              nucleus_membrane       193     5
5          nucleus_mitochondria       135     4
6         membrane_mitochondria       136     4
7 nucleus_membrane_mitochondria        37     1
                                                                          ids
1 zMknh, TUJhp, QVf8L, P5vps, w4NX6, 2IVbG, AT0RG, SxiO7, ErRUg, 1wIAO, YgefT
2        P5vps, w4NX6, nj3Tv, 2IVbG, xRMA3, eZzb4, ErRUg, l9qwa, SQWq3, YgefT
3                      P5vps, QMw74, eZzb4, AT0RG, SxiO7, l9qwa, 1wIAO, SQWq3
4                                           P5vps, w4NX6, 2IVbG, ErRUg, YgefT
5                                                  P5vps, AT0RG, SxiO7, 1wIAO
6                                                  P5vps, eZzb4, l9qwa, SQWq3
7                                                                       P5vps