运行 数据帧中所有对的方程,输出矩阵
Running a equation on all pairs in dataframe, output matrix
我有一个数据框,其中的行对应于每个人的测量值。我需要 运行 所有个体对的方程式,并将结果输出为矩阵,其值对应于每对个体的输出。
数据如下所示:
df = data.frame(
sample=c("sample01","sample02","sample03","sample04","sample05"),
start=c(233,99,288,313,346),
min_01=c(2.94,3.26,3.15,2.55,2.59),
min_02=c(4.22,4.97,3.51,4.14,4.12),
min_03=c(5.7,6.61,4.86,5.44,5.47),
min_04=c(7.15,8.26,6.3,7.14,7.04),
min_05=c(10.52,11.9,9.7,10.49,10.25),
min_06=c(13.81,15.51,13.02,14.55,14.62),
min_07=c(16.15,18.98,16.63,18.19,17.49),
min_08=c(15.34,18.43,15.83,17.86,17.08),
min_09=c(14.27,15.59,13.27,14.87,14.6),
min_10=c(9.83,10.9,9,10.14,9.83),
min_11=c(5.53,5.95,4.31,5.26,5.18),
min_12=c(3.12,2.98,2.96,2.35,2.3),
max_01=c(13.13,14.1,14.92,14.46,13.34),
max_02=c(15.83,16.92,16.86,16.35,15.74),
max_03=c(18.49,19.75,19.23,18.99,18.47),
max_04=c(22.86,23.46,22.99,20.93,22.89),
max_05=c(27.53,28.75,27.74,26.12,28.42),
max_06=c(31.88,33.4,32.29,31.09,33.46),
max_07=c(35.23,36.78,36.02,35.51,37.3),
max_08=c(34.68,36.15,35.56,35.4,36.61),
max_09=c(32.44,32.97,32.3,32.31,33.11),
max_10=c(26.66,26.94,26.27,26.22,26.87),
max_11=c(17.96,19.2,19.08,19.06,18.51),
max_12=c(13.06,14.12,14.74,14.17,13.26))
运行 的等式是:
sample01-02-01 = (sample01$max_01-sample02$min_01)/SQRT((sample01$max_01-sample01$min_01)*(sample02$max_01-sample02$min_01))
sample01-02-02 = (sample01$max_02-sample02$min_02)/SQRT((sample01$max_02-sample01$min_02)*(sample02$max_02-sample02$min_02))
sample01-02-03 = (sample01$max_03... etc
...每对总共 12 个(直到 sample01-02-12),求和以生成成对输出矩阵的单个值。
如有任何帮助,我们将不胜感激!
这是一个镜头,使用了 tidyr
的一些重塑和分组操作:
library(dplyr)
library(tidyr)
longdf <- df %>%
pivot_longer(
cols = min_01:max_12,
names_to = c(".value", "set"),
names_pattern = "(.*)_(.*)"
)
longdf
# # A tibble: 60 x 5
# sample start set min max
# <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1
# 2 sample01 233 02 4.22 15.8
# 3 sample01 233 03 5.7 18.5
# 4 sample01 233 04 7.15 22.9
# 5 sample01 233 05 10.5 27.5
# 6 sample01 233 06 13.8 31.9
# 7 sample01 233 07 16.2 35.2
# 8 sample01 233 08 15.3 34.7
# 9 sample01 233 09 14.3 32.4
# 10 sample01 233 10 9.83 26.7
# # ... with 50 more rows
从这里开始,让我们以笛卡尔方式将其重新加入自身,删除自我比较并确保集合相同(min_01
与另一个样本的 min_01
):
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2)
# # A tibble: 240 x 10
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2
# # ... with 230 more rows
从这里开始,"just"一些数学运算:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
)
# # A tibble: 240 x 11
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2 out
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1 0.939
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9 0.911
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5 0.960
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3 1.01
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9 0.922
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9 0.990
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4 0.982
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7 1.01
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8 0.916
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2 1.01
# # ... with 230 more rows
...并按组求和:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup()
# # A tibble: 20 x 3
# sample1 sample2 out
# <fct> <fct> <dbl>
# 1 sample01 sample02 11.1
# 2 sample01 sample03 11.9
# 3 sample01 sample04 11.8
# 4 sample01 sample05 11.8
# 5 sample02 sample01 12.9
# 6 sample02 sample03 12.8
# 7 sample02 sample04 12.7
# 8 sample02 sample05 12.6
# 9 sample03 sample01 12.1
# 10 sample03 sample02 11.3
# 11 sample03 sample04 12.0
# 12 sample03 sample05 11.9
# 13 sample04 sample01 12.2
# 14 sample04 sample02 11.3
# 15 sample04 sample03 12.1
# 16 sample04 sample05 11.9
# 17 sample05 sample01 12.3
# 18 sample05 sample02 11.4
# 19 sample05 sample03 12.1
# 20 sample05 sample04 12.1
如果您需要它们采用类似矩阵的布局,那么
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup() %>%
pivot_wider(sample1, names_from = "sample2", values_from = "out") %>%
select(c("sample1", setdiff(sort(colnames(.)), "sample1")))
# # A tibble: 5 x 6
# sample1 sample01 sample02 sample03 sample04 sample05
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 sample01 NA 11.1 11.9 11.8 11.8
# 2 sample02 12.9 NA 12.8 12.7 12.6
# 3 sample03 12.1 11.3 NA 12.0 11.9
# 4 sample04 12.2 11.3 12.1 NA 11.9
# 5 sample05 12.3 11.4 12.1 12.1 NA
这里是没有额外依赖的尝试。它使用 combn
生成所有个体对,然后使用 apply
和自定义值函数计算所需值。
# Helpers
run_eqn <- function(sample_1, sample_2, eqn_ind) {
# eqn_ind is a character string, e.g. "01", "02", ..., "12"
max_id <- paste0("max_", eqn_ind)
min_id <- paste0("min_", eqn_ind)
(sample_1[[max_id]] - sample_2[[min_id]]) / sqrt((sample_1[[max_id]] - sample_1[[min_id]]) * (sample_2[[max_id]] - sample_2[[min_id]]))
}
sum_eqn <- function(sample_1, sample_2) {
eqn_ind <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
sum(sapply(eqn_ind, run_eqn, sample_1 = sample_1, sample_2 = sample_2))
}
# Main
pairs <- t(combn(1:5, 2))
pairs_value <- apply(pairs, 1, function(pair) {
sum_eqn(df[pair[1], ], df[pair[2], ])
})
res <- cbind(pairs, pairs_value)
colnames(res) <- c("sample_1_id", "sample_2_id", "value")
res
# sample_1_id sample_2_id value
# [1,] 1 2 11.11671
# [2,] 1 3 11.89706
# [3,] 1 4 11.84859
# [4,] 1 5 11.75426
# [5,] 2 3 12.75344
# [6,] 2 4 12.73372
# [7,] 2 5 12.61932
# [8,] 3 4 11.96217
# [9,] 3 5 11.87361
# [10,] 4 5 11.91347
我有一个数据框,其中的行对应于每个人的测量值。我需要 运行 所有个体对的方程式,并将结果输出为矩阵,其值对应于每对个体的输出。
数据如下所示:
df = data.frame(
sample=c("sample01","sample02","sample03","sample04","sample05"),
start=c(233,99,288,313,346),
min_01=c(2.94,3.26,3.15,2.55,2.59),
min_02=c(4.22,4.97,3.51,4.14,4.12),
min_03=c(5.7,6.61,4.86,5.44,5.47),
min_04=c(7.15,8.26,6.3,7.14,7.04),
min_05=c(10.52,11.9,9.7,10.49,10.25),
min_06=c(13.81,15.51,13.02,14.55,14.62),
min_07=c(16.15,18.98,16.63,18.19,17.49),
min_08=c(15.34,18.43,15.83,17.86,17.08),
min_09=c(14.27,15.59,13.27,14.87,14.6),
min_10=c(9.83,10.9,9,10.14,9.83),
min_11=c(5.53,5.95,4.31,5.26,5.18),
min_12=c(3.12,2.98,2.96,2.35,2.3),
max_01=c(13.13,14.1,14.92,14.46,13.34),
max_02=c(15.83,16.92,16.86,16.35,15.74),
max_03=c(18.49,19.75,19.23,18.99,18.47),
max_04=c(22.86,23.46,22.99,20.93,22.89),
max_05=c(27.53,28.75,27.74,26.12,28.42),
max_06=c(31.88,33.4,32.29,31.09,33.46),
max_07=c(35.23,36.78,36.02,35.51,37.3),
max_08=c(34.68,36.15,35.56,35.4,36.61),
max_09=c(32.44,32.97,32.3,32.31,33.11),
max_10=c(26.66,26.94,26.27,26.22,26.87),
max_11=c(17.96,19.2,19.08,19.06,18.51),
max_12=c(13.06,14.12,14.74,14.17,13.26))
运行 的等式是:
sample01-02-01 = (sample01$max_01-sample02$min_01)/SQRT((sample01$max_01-sample01$min_01)*(sample02$max_01-sample02$min_01))
sample01-02-02 = (sample01$max_02-sample02$min_02)/SQRT((sample01$max_02-sample01$min_02)*(sample02$max_02-sample02$min_02))
sample01-02-03 = (sample01$max_03... etc
...每对总共 12 个(直到 sample01-02-12),求和以生成成对输出矩阵的单个值。
如有任何帮助,我们将不胜感激!
这是一个镜头,使用了 tidyr
的一些重塑和分组操作:
library(dplyr)
library(tidyr)
longdf <- df %>%
pivot_longer(
cols = min_01:max_12,
names_to = c(".value", "set"),
names_pattern = "(.*)_(.*)"
)
longdf
# # A tibble: 60 x 5
# sample start set min max
# <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1
# 2 sample01 233 02 4.22 15.8
# 3 sample01 233 03 5.7 18.5
# 4 sample01 233 04 7.15 22.9
# 5 sample01 233 05 10.5 27.5
# 6 sample01 233 06 13.8 31.9
# 7 sample01 233 07 16.2 35.2
# 8 sample01 233 08 15.3 34.7
# 9 sample01 233 09 14.3 32.4
# 10 sample01 233 10 9.83 26.7
# # ... with 50 more rows
从这里开始,让我们以笛卡尔方式将其重新加入自身,删除自我比较并确保集合相同(min_01
与另一个样本的 min_01
):
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2)
# # A tibble: 240 x 10
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2
# # ... with 230 more rows
从这里开始,"just"一些数学运算:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
)
# # A tibble: 240 x 11
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2 out
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1 0.939
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9 0.911
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5 0.960
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3 1.01
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9 0.922
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9 0.990
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4 0.982
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7 1.01
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8 0.916
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2 1.01
# # ... with 230 more rows
...并按组求和:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup()
# # A tibble: 20 x 3
# sample1 sample2 out
# <fct> <fct> <dbl>
# 1 sample01 sample02 11.1
# 2 sample01 sample03 11.9
# 3 sample01 sample04 11.8
# 4 sample01 sample05 11.8
# 5 sample02 sample01 12.9
# 6 sample02 sample03 12.8
# 7 sample02 sample04 12.7
# 8 sample02 sample05 12.6
# 9 sample03 sample01 12.1
# 10 sample03 sample02 11.3
# 11 sample03 sample04 12.0
# 12 sample03 sample05 11.9
# 13 sample04 sample01 12.2
# 14 sample04 sample02 11.3
# 15 sample04 sample03 12.1
# 16 sample04 sample05 11.9
# 17 sample05 sample01 12.3
# 18 sample05 sample02 11.4
# 19 sample05 sample03 12.1
# 20 sample05 sample04 12.1
如果您需要它们采用类似矩阵的布局,那么
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup() %>%
pivot_wider(sample1, names_from = "sample2", values_from = "out") %>%
select(c("sample1", setdiff(sort(colnames(.)), "sample1")))
# # A tibble: 5 x 6
# sample1 sample01 sample02 sample03 sample04 sample05
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 sample01 NA 11.1 11.9 11.8 11.8
# 2 sample02 12.9 NA 12.8 12.7 12.6
# 3 sample03 12.1 11.3 NA 12.0 11.9
# 4 sample04 12.2 11.3 12.1 NA 11.9
# 5 sample05 12.3 11.4 12.1 12.1 NA
这里是没有额外依赖的尝试。它使用 combn
生成所有个体对,然后使用 apply
和自定义值函数计算所需值。
# Helpers
run_eqn <- function(sample_1, sample_2, eqn_ind) {
# eqn_ind is a character string, e.g. "01", "02", ..., "12"
max_id <- paste0("max_", eqn_ind)
min_id <- paste0("min_", eqn_ind)
(sample_1[[max_id]] - sample_2[[min_id]]) / sqrt((sample_1[[max_id]] - sample_1[[min_id]]) * (sample_2[[max_id]] - sample_2[[min_id]]))
}
sum_eqn <- function(sample_1, sample_2) {
eqn_ind <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
sum(sapply(eqn_ind, run_eqn, sample_1 = sample_1, sample_2 = sample_2))
}
# Main
pairs <- t(combn(1:5, 2))
pairs_value <- apply(pairs, 1, function(pair) {
sum_eqn(df[pair[1], ], df[pair[2], ])
})
res <- cbind(pairs, pairs_value)
colnames(res) <- c("sample_1_id", "sample_2_id", "value")
res
# sample_1_id sample_2_id value
# [1,] 1 2 11.11671
# [2,] 1 3 11.89706
# [3,] 1 4 11.84859
# [4,] 1 5 11.75426
# [5,] 2 3 12.75344
# [6,] 2 4 12.73372
# [7,] 2 5 12.61932
# [8,] 3 4 11.96217
# [9,] 3 5 11.87361
# [10,] 4 5 11.91347