运行 数据帧中所有对的方程,输出矩阵

Running a equation on all pairs in dataframe, output matrix

我有一个数据框,其中的行对应于每个人的测量值。我需要 运行 所有个体对的方程式,并将结果输出为矩阵,其值对应于每对个体的输出。

数据如下所示:

df = data.frame(
sample=c("sample01","sample02","sample03","sample04","sample05"),
start=c(233,99,288,313,346),
min_01=c(2.94,3.26,3.15,2.55,2.59),
min_02=c(4.22,4.97,3.51,4.14,4.12),
min_03=c(5.7,6.61,4.86,5.44,5.47),
min_04=c(7.15,8.26,6.3,7.14,7.04),
min_05=c(10.52,11.9,9.7,10.49,10.25),
min_06=c(13.81,15.51,13.02,14.55,14.62),
min_07=c(16.15,18.98,16.63,18.19,17.49),
min_08=c(15.34,18.43,15.83,17.86,17.08),
min_09=c(14.27,15.59,13.27,14.87,14.6),
min_10=c(9.83,10.9,9,10.14,9.83),
min_11=c(5.53,5.95,4.31,5.26,5.18),
min_12=c(3.12,2.98,2.96,2.35,2.3),
max_01=c(13.13,14.1,14.92,14.46,13.34),
max_02=c(15.83,16.92,16.86,16.35,15.74),
max_03=c(18.49,19.75,19.23,18.99,18.47),
max_04=c(22.86,23.46,22.99,20.93,22.89),
max_05=c(27.53,28.75,27.74,26.12,28.42),
max_06=c(31.88,33.4,32.29,31.09,33.46),
max_07=c(35.23,36.78,36.02,35.51,37.3),
max_08=c(34.68,36.15,35.56,35.4,36.61),
max_09=c(32.44,32.97,32.3,32.31,33.11),
max_10=c(26.66,26.94,26.27,26.22,26.87),
max_11=c(17.96,19.2,19.08,19.06,18.51),
max_12=c(13.06,14.12,14.74,14.17,13.26))

运行 的等式是:

sample01-02-01 = (sample01$max_01-sample02$min_01)/SQRT((sample01$max_01-sample01$min_01)*(sample02$max_01-sample02$min_01))

sample01-02-02 = (sample01$max_02-sample02$min_02)/SQRT((sample01$max_02-sample01$min_02)*(sample02$max_02-sample02$min_02))

sample01-02-03 = (sample01$max_03... etc

...每对总共 12 个(直到 sample01-02-12),求和以生成成对输出矩阵的单个值。

如有任何帮助,我们将不胜感激!

这是一个镜头,使用了 tidyr 的一些重塑和分组操作:

library(dplyr)
library(tidyr)
longdf <- df %>%
  pivot_longer(
    cols = min_01:max_12,
    names_to = c(".value", "set"),
    names_pattern = "(.*)_(.*)"
  )
longdf
# # A tibble: 60 x 5
#    sample   start set     min   max
#    <fct>    <dbl> <chr> <dbl> <dbl>
#  1 sample01   233 01     2.94  13.1
#  2 sample01   233 02     4.22  15.8
#  3 sample01   233 03     5.7   18.5
#  4 sample01   233 04     7.15  22.9
#  5 sample01   233 05    10.5   27.5
#  6 sample01   233 06    13.8   31.9
#  7 sample01   233 07    16.2   35.2
#  8 sample01   233 08    15.3   34.7
#  9 sample01   233 09    14.3   32.4
# 10 sample01   233 10     9.83  26.7
# # ... with 50 more rows

从这里开始,让我们以笛卡尔方式将其重新加入自身,删除自我比较并确保集合相同(min_01 与另一个样本的 min_01):

crossing(
  rename_all(longdf, ~ paste0(., "1")),
  rename_all(longdf, ~ paste0(., "2"))
) %>% 
  filter(sample1 != sample2, set1 == set2)
# # A tibble: 240 x 10
#    sample1  start1 set1   min1  max1 sample2  start2 set2   min2  max2
#    <fct>     <dbl> <chr> <dbl> <dbl> <fct>     <dbl> <chr> <dbl> <dbl>
#  1 sample01    233 01     2.94  13.1 sample02     99 01     3.26  14.1
#  2 sample01    233 01     2.94  13.1 sample03    288 01     3.15  14.9
#  3 sample01    233 01     2.94  13.1 sample04    313 01     2.55  14.5
#  4 sample01    233 01     2.94  13.1 sample05    346 01     2.59  13.3
#  5 sample01    233 02     4.22  15.8 sample02     99 02     4.97  16.9
#  6 sample01    233 02     4.22  15.8 sample03    288 02     3.51  16.9
#  7 sample01    233 02     4.22  15.8 sample04    313 02     4.14  16.4
#  8 sample01    233 02     4.22  15.8 sample05    346 02     4.12  15.7
#  9 sample01    233 03     5.7   18.5 sample02     99 03     6.61  19.8
# 10 sample01    233 03     5.7   18.5 sample03    288 03     4.86  19.2
# # ... with 230 more rows

从这里开始,"just"一些数学运算:

crossing(
  rename_all(longdf, ~ paste0(., "1")),
  rename_all(longdf, ~ paste0(., "2"))
) %>% 
  filter(sample1 != sample2, set1 == set2) %>%
  mutate(
    out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
  )
# # A tibble: 240 x 11
#    sample1  start1 set1   min1  max1 sample2  start2 set2   min2  max2   out
#    <fct>     <dbl> <chr> <dbl> <dbl> <fct>     <dbl> <chr> <dbl> <dbl> <dbl>
#  1 sample01    233 01     2.94  13.1 sample02     99 01     3.26  14.1 0.939
#  2 sample01    233 01     2.94  13.1 sample03    288 01     3.15  14.9 0.911
#  3 sample01    233 01     2.94  13.1 sample04    313 01     2.55  14.5 0.960
#  4 sample01    233 01     2.94  13.1 sample05    346 01     2.59  13.3 1.01 
#  5 sample01    233 02     4.22  15.8 sample02     99 02     4.97  16.9 0.922
#  6 sample01    233 02     4.22  15.8 sample03    288 02     3.51  16.9 0.990
#  7 sample01    233 02     4.22  15.8 sample04    313 02     4.14  16.4 0.982
#  8 sample01    233 02     4.22  15.8 sample05    346 02     4.12  15.7 1.01 
#  9 sample01    233 03     5.7   18.5 sample02     99 03     6.61  19.8 0.916
# 10 sample01    233 03     5.7   18.5 sample03    288 03     4.86  19.2 1.01 
# # ... with 230 more rows

...并按组求和:

crossing(
  rename_all(longdf, ~ paste0(., "1")),
  rename_all(longdf, ~ paste0(., "2"))
) %>% 
  filter(sample1 != sample2, set1 == set2) %>%
  mutate(
    out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
  ) %>%
  group_by(sample1, sample2) %>%
  summarize(out = sum(out)) %>%
  ungroup()
# # A tibble: 20 x 3
#    sample1  sample2    out
#    <fct>    <fct>    <dbl>
#  1 sample01 sample02  11.1
#  2 sample01 sample03  11.9
#  3 sample01 sample04  11.8
#  4 sample01 sample05  11.8
#  5 sample02 sample01  12.9
#  6 sample02 sample03  12.8
#  7 sample02 sample04  12.7
#  8 sample02 sample05  12.6
#  9 sample03 sample01  12.1
# 10 sample03 sample02  11.3
# 11 sample03 sample04  12.0
# 12 sample03 sample05  11.9
# 13 sample04 sample01  12.2
# 14 sample04 sample02  11.3
# 15 sample04 sample03  12.1
# 16 sample04 sample05  11.9
# 17 sample05 sample01  12.3
# 18 sample05 sample02  11.4
# 19 sample05 sample03  12.1
# 20 sample05 sample04  12.1

如果您需要它们采用类似矩阵的布局,那么

crossing(
  rename_all(longdf, ~ paste0(., "1")),
  rename_all(longdf, ~ paste0(., "2"))
) %>% 
  filter(sample1 != sample2, set1 == set2) %>%
  mutate(
    out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
  ) %>%
  group_by(sample1, sample2) %>%
  summarize(out = sum(out)) %>%
  ungroup() %>%
  pivot_wider(sample1, names_from = "sample2", values_from = "out") %>%
  select(c("sample1", setdiff(sort(colnames(.)), "sample1")))
# # A tibble: 5 x 6
#   sample1  sample01 sample02 sample03 sample04 sample05
#   <fct>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
# 1 sample01     NA       11.1     11.9     11.8     11.8
# 2 sample02     12.9     NA       12.8     12.7     12.6
# 3 sample03     12.1     11.3     NA       12.0     11.9
# 4 sample04     12.2     11.3     12.1     NA       11.9
# 5 sample05     12.3     11.4     12.1     12.1     NA  

这里是没有额外依赖的尝试。它使用 combn 生成所有个体对,然后使用 apply 和自定义值函数计算所需值。

# Helpers
run_eqn <- function(sample_1, sample_2, eqn_ind) {
  # eqn_ind is a character string, e.g. "01", "02", ..., "12"
  max_id <- paste0("max_", eqn_ind)
  min_id <- paste0("min_", eqn_ind)
  (sample_1[[max_id]] - sample_2[[min_id]]) / sqrt((sample_1[[max_id]] - sample_1[[min_id]]) * (sample_2[[max_id]] - sample_2[[min_id]]))
}

sum_eqn <- function(sample_1, sample_2) {
  eqn_ind <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
  sum(sapply(eqn_ind, run_eqn, sample_1 = sample_1, sample_2 = sample_2))
}

# Main
pairs <- t(combn(1:5, 2))
pairs_value <- apply(pairs, 1, function(pair) {
  sum_eqn(df[pair[1], ], df[pair[2], ])
})
res <- cbind(pairs, pairs_value)
colnames(res) <- c("sample_1_id", "sample_2_id", "value")

res
# sample_1_id sample_2_id    value
# [1,]           1           2 11.11671
# [2,]           1           3 11.89706
# [3,]           1           4 11.84859
# [4,]           1           5 11.75426
# [5,]           2           3 12.75344
# [6,]           2           4 12.73372
# [7,]           2           5 12.61932
# [8,]           3           4 11.96217
# [9,]           3           5 11.87361
# [10,]           4           5 11.91347