编码挑战:如果其他列中的行为真,则计算列值的总和(table 摘要)
Coding challenge: Calculate sum of col values if rows in other col are TRUE (table summary)
我们有一个表(table),其中第一列包含数值(蛋白质计数),随后的各列是逻辑向量(TRUE 或 FALSE,表示该 protein_id 是否具有相应属性)。对于每个属性列,我们希望得到该列为 TRUE 的所有行的数值总和,以及 TRUE 的个数。
用示例数据来描述这个任务可能更清楚:
[请原谅示例数据需要用到一个随机 ID 生成器包;
如果您知道只用 base R 的做法,请留言,我会把它补充到这里。]
library("stringi")
# Sample data.
# `value` holds 100 draws (5 batches of 20) while every other column below has
# 20 elements, so data.frame() recycles the shorter columns five times: the
# result has 100 rows and each id occurs five times.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(6:10, 20, replace = TRUE),
  sample(1:7, 20, replace = TRUE),
  sample(3:10, 20, replace = TRUE),
  sample(10:20, 20, replace = TRUE)
)
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE)
)
对于每个属性列,我们希望得到所有数值的总和以及所有 ID 的个数。接下来,检查是否有行在多个属性列中同时为 TRUE。如果有:新增一行,其属性名是这些列名用 _ 连接而成的字符串,并给出对应数值的总和。最后一列包含所有相关 ID,用 ; 分隔。
# Illustrative shape of the desired summary; the values are placeholders.
# One row per property or combination of properties: combined property names
# are joined with "_" and the ids of each row are separated by ";".
expected_result_1 <- data.frame(
  property = c("nucleus", "membrane", "mitochondria", "nucleus_membrane"),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv;htiFJ"
  )
)
dplyr 解决方案会很棒!
谢谢!
塞巴斯蒂安
这是您要找的吗? (以下代码已被编辑!)
# Reshape to long format: one row per (id, property) pair, keeping only the
# pairs where the property flag is TRUE. `id` and `value` are excluded by exact
# name; the regex matches("id|value") would also drop any property column whose
# name merely contains "id" or "value".
data <- data %>%
  pivot_longer(
    cols = -c(id, value),
    names_to = "property",
    values_to = "is_true"
  ) %>%
  filter(is_true)

# One summary row per property, in order of first appearance.
# The sample data repeats every id several times, so ids are de-duplicated
# before being listed and counted; value_sum still adds up every matching row.
# (length(str_split(ids, ",")) returned the number of rows in the group, not
# the number of ids.)
data %>%
  group_by(property) %>%
  transmute(
    value_sum = sum(value),
    ids = paste0(unique(id), collapse = ","),
    n_ids = n_distinct(id)
  ) %>%
  ungroup() %>%
  distinct(property, .keep_all = TRUE)
# NOTE: the output below was recorded with the earlier version of this snippet,
# which listed repeated ids and reported the group row count as n_ids. With
# de-duplication every id is listed once and n_ids is correspondingly smaller.
# # A tibble: 3 x 4
# property value_sum ids n_ids
# <chr> <int> <chr> <int>
# 1 membrane 400 ACiov,3XyaR,z68K4,GaUxZ,YZioV,mEZtJ,J5T67,T6Ine,a33ed,Flgnx,g33vm,ACiov,3XyaR,z68K4,GaU… 55
# 2 nucleus 312 tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8EgQ,Sl9nM,T6Ine,2zgbM,tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8E… 45
# 3 mitochondr… 310 tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6Ine,ViWyl,2zgbM,g33vm,tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6I… 45
更新:
我误解了OP的问题。我在这里包括一个仍然使用 dplyr
(某种程度上)并使用 gtools
来计算新列名称的解决方案。
library("stringi")
library("stringr")
library("dplyr")
library("magrittr")
library("tidyr")
library("gtools")
# Sample data ----
# `value` holds 100 draws (5 batches of 20) while every other column has 20
# elements, so data.frame() recycles the shorter columns five times: the result
# has 100 rows and each id occurs five times.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(6:10, 20, replace = TRUE),
  sample(1:7, 20, replace = TRUE),
  sample(3:10, 20, replace = TRUE),
  sample(10:20, 20, replace = TRUE)
)
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE)
)

# Illustrative shape of the desired summary; the values are placeholders.
# Combined property names are joined with "_" and ids are separated by ";".
expected_result_1 <- data.frame(
  property = c("nucleus", "membrane", "mitochondria", "nucleus_membrane"),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv;htiFJ"
  )
)

# SOLUTION ----
# Property columns are everything except `id` and `value`. They are excluded by
# exact name: the regex "id|value" would also drop a property column whose name
# merely contains one of those substrings.
cols <- setdiff(colnames(data), c("id", "value"))
#' Build the names of all `x`-way combinations of the property columns.
#'
#' Reads the character vector `cols` (the property column names) from the
#' enclosing environment. Each combination has its components sorted and
#' joined with "_", and the combinations come out in lexicographic order.
#' These are the same names, in the same order, that were previously obtained
#' by enumerating every permutation with gtools::permutations() and then
#' splitting, sorting and de-duplicating them. Enumerating combinations
#' directly avoids the factorial amount of throw-away work and no longer
#' breaks when a column name itself contains "_".
#'
#' @param x Number of columns per combination; a single number between 1 and
#'   the number of distinct entries in `cols`.
#' @return An unnamed character vector with one element per combination.
myfunc <- function(x) {
  props <- sort(unique(cols))
  stopifnot(length(x) == 1L, x >= 1, x <= length(props))
  # simplify = FALSE keeps one character vector per combination, so the result
  # below is always a plain character vector, even for a single combination.
  combos <- utils::combn(props, x, simplify = FALSE)
  vapply(combos, paste, character(1), collapse = "_")
}
# Names of every 2-way, 3-way, ... combination of the property columns, built
# with myfunc() from above. seq_len(...)[-1] is empty when there is only one
# property column, whereas 2:length(cols) would count down (2, 1) and ask for
# an impossible 2-way combination.
newcols <- unlist(lapply(seq_len(length(cols))[-1], myfunc))

# Add one logical column per combination: TRUE only in rows where every
# component column is TRUE. The component names are recovered by splitting the
# combined name on "_", so this step assumes property names contain no "_".
# The columns are combined directly with `&` instead of building R code as a
# string and running it through eval(parse()).
for (new_col in newcols) {
  parts <- strsplit(new_col, "_", fixed = TRUE)[[1]]
  data[[new_col]] <- Reduce(`&`, as.list(data[parts]))
}

# Pivot longer to gather the single and combined property columns together,
# retaining only the TRUE flags. `id` and `value` are excluded by exact name.
data <- data %>%
  pivot_longer(
    cols = -c(id, value),
    names_to = "property",
    values_to = "is_true"
  ) %>%
  filter(is_true)

# Calculate value_sum, ids and n_ids: one row per property, in order of first
# appearance. ids are de-duplicated because the sample data repeats each id;
# value_sum adds up every matching row.
data %>%
  group_by(property) %>%
  transmute(
    value_sum = sum(value),
    ids = paste0(unique(id), collapse = ","),
    n_ids = n_distinct(id)
  ) %>%
  ungroup() %>%
  distinct(property, .keep_all = TRUE)
# # A tibble: 7 x 4
# property value_sum ids n_ids
# <chr> <int> <chr> <int>
# 1 nucleus 472 LpA3Q,2s04A,m1QMR,Lh4HH,wILrx,xKLfq,hUvsn,22cPw,NMTgz,V42mZ,GnJBd,Fwjbr… 13
# 2 membrane 521 LpA3Q,m1QMR,kYSIh,Lh4HH,CDPr4,wILrx,qAPpb,Zfavp,hUvsn,22cPw,NMTgz,N0RPZ… 14
# 3 mitochondria 252 LpA3Q,kYSIh,Zfavp,N0RPZ,V42mZ,GnJBd,onM21 7
# 4 membrane_mitochondria 182 LpA3Q,kYSIh,Zfavp,N0RPZ,GnJBd 5
# 5 membrane_nucleus 338 LpA3Q,m1QMR,Lh4HH,wILrx,hUvsn,22cPw,NMTgz,GnJBd,Fwjbr 9
# 6 mitochondria_nucleus 142 LpA3Q,V42mZ,GnJBd,onM21 4
# 7 membrane_mitochondria_nucl… 72 LpA3Q,GnJBd 2
我不确定下面的代码能否给出所需的输出,但这是一个 base R 的尝试。
- 首先,我们可以定义一个自定义函数 `f`,用来按属性汇总 `data` 中的信息:
#' Summarise the rows in which every one of the given property columns is TRUE.
#'
#' @param cols Character vector of logical column names to combine.
#' @param df Data frame with `id`, `value` and the property columns. Defaults
#'   to the variable `data` visible from where `f` was defined, so existing
#'   calls of the form `f(cols)` keep working.
#' @return A one-row data frame with `property` (column names joined by "_"),
#'   `value_sum` (sum of `value` over the matching rows, ignoring NA),
#'   `n_ids` (number of distinct matching ids) and `ids` (those ids joined by
#'   ", ").
f <- function(cols, df = data) {
  # A row matches when all selected flags are TRUE. na.rm = TRUE makes an NA
  # flag count as "not TRUE"; otherwise the comparison yields NA, and indexing
  # with NA would add a spurious NA id to `ids` and `n_ids`.
  has_all <- rowSums(df[cols], na.rm = TRUE) == length(cols)
  hit_ids <- unique(df$id[has_all])
  data.frame(
    property = paste0(cols, collapse = "_"),
    value_sum = sum(df$value[has_all], na.rm = TRUE),
    n_ids = length(hit_ids),
    ids = toString(hit_ids)
  )
}
- 然后,选定要汇总的列(`v` 是所选列名组成的向量),并运行下面的代码:
# Property columns to summarise.
v <- c("nucleus", "membrane", "mitochondria")

# For every combination size 1..length(v), f() is applied to each combination
# of that many column names. The per-size lists of one-row data frames are
# concatenated into one flat list and stacked into a single table.
output <- do.call(
  rbind,
  do.call(
    c,
    lapply(
      seq_along(v),
      function(size) combn(v, size, FUN = f, simplify = FALSE)
    )
  )
)
我们会得到
> output
property value_sum n_ids
1 nucleus 406 11
2 membrane 367 10
3 mitochondria 278 8
4 nucleus_membrane 193 5
5 nucleus_mitochondria 135 4
6 membrane_mitochondria 136 4
7 nucleus_membrane_mitochondria 37 1
ids
1 zMknh, TUJhp, QVf8L, P5vps, w4NX6, 2IVbG, AT0RG, SxiO7, ErRUg, 1wIAO, YgefT
2 P5vps, w4NX6, nj3Tv, 2IVbG, xRMA3, eZzb4, ErRUg, l9qwa, SQWq3, YgefT
3 P5vps, QMw74, eZzb4, AT0RG, SxiO7, l9qwa, 1wIAO, SQWq3
4 P5vps, w4NX6, 2IVbG, ErRUg, YgefT
5 P5vps, AT0RG, SxiO7, 1wIAO
6 P5vps, eZzb4, l9qwa, SQWq3
7 P5vps
我们有一个表(table),其中第一列包含数值(蛋白质计数),随后的各列是逻辑向量(TRUE 或 FALSE,表示该 protein_id 是否具有相应属性)。对于每个属性列,我们希望得到该列为 TRUE 的所有行的数值总和,以及 TRUE 的个数。
用示例数据来描述这个任务可能更清楚:
[请原谅示例数据需要用到一个随机 ID 生成器包;如果您知道只用 base R 的做法,请留言,我会把它补充到这里。]
library("stringi")
# Sample data.
# `value` holds 100 draws (5 batches of 20) while every other column below has
# 20 elements, so data.frame() recycles the shorter columns five times: the
# result has 100 rows and each id occurs five times.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(6:10, 20, replace = TRUE),
  sample(1:7, 20, replace = TRUE),
  sample(3:10, 20, replace = TRUE),
  sample(10:20, 20, replace = TRUE)
)
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE)
)
对于每个属性列,我们希望得到所有数值的总和以及所有 ID 的个数。接下来,检查是否有行在多个属性列中同时为 TRUE。如果有:新增一行,其属性名是这些列名用 _ 连接而成的字符串,并给出对应数值的总和。最后一列包含所有相关 ID,用 ; 分隔。
# Illustrative shape of the desired summary; the values are placeholders.
# One row per property or combination of properties: combined property names
# are joined with "_" and the ids of each row are separated by ";".
expected_result_1 <- data.frame(
  property = c("nucleus", "membrane", "mitochondria", "nucleus_membrane"),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv;htiFJ"
  )
)
dplyr 解决方案会很棒!
谢谢!
塞巴斯蒂安
这是您要找的吗? (以下代码已被编辑!)
# Reshape to long format: one row per (id, property) pair, keeping only the
# pairs where the property flag is TRUE. `id` and `value` are excluded by exact
# name; the regex matches("id|value") would also drop any property column whose
# name merely contains "id" or "value".
data <- data %>%
  pivot_longer(
    cols = -c(id, value),
    names_to = "property",
    values_to = "is_true"
  ) %>%
  filter(is_true)

# One summary row per property, in order of first appearance.
# The sample data repeats every id several times, so ids are de-duplicated
# before being listed and counted; value_sum still adds up every matching row.
# (length(str_split(ids, ",")) returned the number of rows in the group, not
# the number of ids.)
data %>%
  group_by(property) %>%
  transmute(
    value_sum = sum(value),
    ids = paste0(unique(id), collapse = ","),
    n_ids = n_distinct(id)
  ) %>%
  ungroup() %>%
  distinct(property, .keep_all = TRUE)
# NOTE: the output below was recorded with the earlier version of this snippet,
# which listed repeated ids and reported the group row count as n_ids. With
# de-duplication every id is listed once and n_ids is correspondingly smaller.
# # A tibble: 3 x 4
# property value_sum ids n_ids
# <chr> <int> <chr> <int>
# 1 membrane 400 ACiov,3XyaR,z68K4,GaUxZ,YZioV,mEZtJ,J5T67,T6Ine,a33ed,Flgnx,g33vm,ACiov,3XyaR,z68K4,GaU… 55
# 2 nucleus 312 tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8EgQ,Sl9nM,T6Ine,2zgbM,tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8E… 45
# 3 mitochondr… 310 tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6Ine,ViWyl,2zgbM,g33vm,tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6I… 45
更新:
我误解了OP的问题。我在这里包括一个仍然使用 dplyr
(某种程度上)并使用 gtools
来计算新列名称的解决方案。
library("stringi")
library("stringr")
library("dplyr")
library("magrittr")
library("tidyr")
library("gtools")
# Sample data ----
# `value` holds 100 draws (5 batches of 20) while every other column has 20
# elements, so data.frame() recycles the shorter columns five times: the result
# has 100 rows and each id occurs five times.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(6:10, 20, replace = TRUE),
  sample(1:7, 20, replace = TRUE),
  sample(3:10, 20, replace = TRUE),
  sample(10:20, 20, replace = TRUE)
)
data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE, FALSE), 20, replace = TRUE),
  membrane = sample(c(TRUE, FALSE), 20, replace = TRUE),
  mitochondria = sample(c(TRUE, FALSE), 20, replace = TRUE)
)

# Illustrative shape of the desired summary; the values are placeholders.
# Combined property names are joined with "_" and ids are separated by ";".
expected_result_1 <- data.frame(
  property = c("nucleus", "membrane", "mitochondria", "nucleus_membrane"),
  value_sum = c("x", "y", "z", "w"),
  n_ids = c(4, 3, 1, 2),
  ids = c(
    "MSATv;1NFZ4;Kftq5;JANXo",
    "htiFJ;kCHtA8;jXXh",
    "kCHtA",
    "MSATv;htiFJ"
  )
)

# SOLUTION ----
# Property columns are everything except `id` and `value`. They are excluded by
# exact name: the regex "id|value" would also drop a property column whose name
# merely contains one of those substrings.
cols <- setdiff(colnames(data), c("id", "value"))
#' Build the names of all `x`-way combinations of the property columns.
#'
#' Reads the character vector `cols` (the property column names) from the
#' enclosing environment. Each combination has its components sorted and
#' joined with "_", and the combinations come out in lexicographic order.
#' These are the same names, in the same order, that were previously obtained
#' by enumerating every permutation with gtools::permutations() and then
#' splitting, sorting and de-duplicating them. Enumerating combinations
#' directly avoids the factorial amount of throw-away work and no longer
#' breaks when a column name itself contains "_".
#'
#' @param x Number of columns per combination; a single number between 1 and
#'   the number of distinct entries in `cols`.
#' @return An unnamed character vector with one element per combination.
myfunc <- function(x) {
  props <- sort(unique(cols))
  stopifnot(length(x) == 1L, x >= 1, x <= length(props))
  # simplify = FALSE keeps one character vector per combination, so the result
  # below is always a plain character vector, even for a single combination.
  combos <- utils::combn(props, x, simplify = FALSE)
  vapply(combos, paste, character(1), collapse = "_")
}
# Names of every 2-way, 3-way, ... combination of the property columns, built
# with myfunc() from above. seq_len(...)[-1] is empty when there is only one
# property column, whereas 2:length(cols) would count down (2, 1) and ask for
# an impossible 2-way combination.
newcols <- unlist(lapply(seq_len(length(cols))[-1], myfunc))

# Add one logical column per combination: TRUE only in rows where every
# component column is TRUE. The component names are recovered by splitting the
# combined name on "_", so this step assumes property names contain no "_".
# The columns are combined directly with `&` instead of building R code as a
# string and running it through eval(parse()).
for (new_col in newcols) {
  parts <- strsplit(new_col, "_", fixed = TRUE)[[1]]
  data[[new_col]] <- Reduce(`&`, as.list(data[parts]))
}

# Pivot longer to gather the single and combined property columns together,
# retaining only the TRUE flags. `id` and `value` are excluded by exact name.
data <- data %>%
  pivot_longer(
    cols = -c(id, value),
    names_to = "property",
    values_to = "is_true"
  ) %>%
  filter(is_true)

# Calculate value_sum, ids and n_ids: one row per property, in order of first
# appearance. ids are de-duplicated because the sample data repeats each id;
# value_sum adds up every matching row.
data %>%
  group_by(property) %>%
  transmute(
    value_sum = sum(value),
    ids = paste0(unique(id), collapse = ","),
    n_ids = n_distinct(id)
  ) %>%
  ungroup() %>%
  distinct(property, .keep_all = TRUE)
# # A tibble: 7 x 4
# property value_sum ids n_ids
# <chr> <int> <chr> <int>
# 1 nucleus 472 LpA3Q,2s04A,m1QMR,Lh4HH,wILrx,xKLfq,hUvsn,22cPw,NMTgz,V42mZ,GnJBd,Fwjbr… 13
# 2 membrane 521 LpA3Q,m1QMR,kYSIh,Lh4HH,CDPr4,wILrx,qAPpb,Zfavp,hUvsn,22cPw,NMTgz,N0RPZ… 14
# 3 mitochondria 252 LpA3Q,kYSIh,Zfavp,N0RPZ,V42mZ,GnJBd,onM21 7
# 4 membrane_mitochondria 182 LpA3Q,kYSIh,Zfavp,N0RPZ,GnJBd 5
# 5 membrane_nucleus 338 LpA3Q,m1QMR,Lh4HH,wILrx,hUvsn,22cPw,NMTgz,GnJBd,Fwjbr 9
# 6 mitochondria_nucleus 142 LpA3Q,V42mZ,GnJBd,onM21 4
# 7 membrane_mitochondria_nucl… 72 LpA3Q,GnJBd 2
我不确定下面的代码能否给出所需的输出,但这是一个 base R 的尝试。
- 首先,我们可以定义一个自定义函数 `f`,用来按属性汇总 `data` 中的信息:
#' Summarise the rows in which every one of the given property columns is TRUE.
#'
#' @param cols Character vector of logical column names to combine.
#' @param df Data frame with `id`, `value` and the property columns. Defaults
#'   to the variable `data` visible from where `f` was defined, so existing
#'   calls of the form `f(cols)` keep working.
#' @return A one-row data frame with `property` (column names joined by "_"),
#'   `value_sum` (sum of `value` over the matching rows, ignoring NA),
#'   `n_ids` (number of distinct matching ids) and `ids` (those ids joined by
#'   ", ").
f <- function(cols, df = data) {
  # A row matches when all selected flags are TRUE. na.rm = TRUE makes an NA
  # flag count as "not TRUE"; otherwise the comparison yields NA, and indexing
  # with NA would add a spurious NA id to `ids` and `n_ids`.
  has_all <- rowSums(df[cols], na.rm = TRUE) == length(cols)
  hit_ids <- unique(df$id[has_all])
  data.frame(
    property = paste0(cols, collapse = "_"),
    value_sum = sum(df$value[has_all], na.rm = TRUE),
    n_ids = length(hit_ids),
    ids = toString(hit_ids)
  )
}
- 然后,选定要汇总的列(`v` 是所选列名组成的向量),并运行下面的代码:
# Property columns to summarise.
v <- c("nucleus", "membrane", "mitochondria")

# For every combination size 1..length(v), f() is applied to each combination
# of that many column names. The per-size lists of one-row data frames are
# concatenated into one flat list and stacked into a single table.
output <- do.call(
  rbind,
  do.call(
    c,
    lapply(
      seq_along(v),
      function(size) combn(v, size, FUN = f, simplify = FALSE)
    )
  )
)
我们会得到
> output
property value_sum n_ids
1 nucleus 406 11
2 membrane 367 10
3 mitochondria 278 8
4 nucleus_membrane 193 5
5 nucleus_mitochondria 135 4
6 membrane_mitochondria 136 4
7 nucleus_membrane_mitochondria 37 1
ids
1 zMknh, TUJhp, QVf8L, P5vps, w4NX6, 2IVbG, AT0RG, SxiO7, ErRUg, 1wIAO, YgefT
2 P5vps, w4NX6, nj3Tv, 2IVbG, xRMA3, eZzb4, ErRUg, l9qwa, SQWq3, YgefT
3 P5vps, QMw74, eZzb4, AT0RG, SxiO7, l9qwa, 1wIAO, SQWq3
4 P5vps, w4NX6, 2IVbG, ErRUg, YgefT
5 P5vps, AT0RG, SxiO7, 1wIAO
6 P5vps, eZzb4, l9qwa, SQWq3
7 P5vps