如何在 R 中对分组数据按列执行 wilcox.test 和 Fisher 精确检验
How to perform column-wise wilcox.test and fisher exact on groups in R
我有 data.frame df1:
# Example input: one row per En_ID, with four "Nor" sample columns
# (Nor1-Nor4) and five "Tor" sample columns (Tor1-Tor5).
df1 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005", "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938", "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549, -0.558658947885705,
    -0.390114219973209, -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064, -0.597054772455507,
    -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646, -1.23261719368372,
    0.0855281133529292, -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964, -4.74738483548391,
    -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442, -0.548673555705647,
    -0.0149651083306594, -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181, 1.17788731755683,
    -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064, -0.277409459604357,
    -0.241073614962264, -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232, -2.6274528854429,
    -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192, -0.231526399163731,
    -0.530038395734114, -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103, 1.68335644352003,
    -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259, -0.532975340622715,
    -0.182089795101371, -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306, -0.433459827401695,
    -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343, -0.215144871523998,
    -0.92234350748557, -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426, -0.556984056084809,
    -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375, -0.381047638414538,
    -0.613568289061259, -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111, -1.50483745357134,
    -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639, -0.869434195504952,
    -0.70405854858187, -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143, -0.219526004620518,
    -0.301435496557957
  )
)
如何按列分组执行 Wilcoxon 检验(wilcox.test)和 Fisher 精确检验,即对每一行将 Nor1、Nor2、Nor3 和 Nor4 列与 Tor1、Tor2、Tor3、Tor4 和 Tor5 列进行比较?然后,我想把两个检验输出的 p 值添加为最后两列,得到 df2:
# Sketch of the desired output: the ten input columns plus one p-value column
# per test.
#
# Each sample column appears exactly once. The original literal passed `Tor4`
# a second time, which data.frame() silently turns into an extra `Tor4.1`
# column. The p-value columns use syntactic names directly; data.frame()
# would rewrite `p-value-wilcox` / `p-value-fisher` to these names anyway.
#
# The p-values are placeholders that only illustrate the layout. Some are
# negative, which a real p-value can never be.
df2 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005", "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938", "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549, -0.558658947885705,
    -0.390114219973209, -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064, -0.597054772455507,
    -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646, -1.23261719368372,
    0.0855281133529292, -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964, -4.74738483548391,
    -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442, -0.548673555705647,
    -0.0149651083306594, -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181, 1.17788731755683,
    -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064, -0.277409459604357,
    -0.241073614962264, -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232, -2.6274528854429,
    -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192, -0.231526399163731,
    -0.530038395734114, -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103, 1.68335644352003,
    -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259, -0.532975340622715,
    -0.182089795101371, -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306, -0.433459827401695,
    -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343, -0.215144871523998,
    -0.92234350748557, -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426, -0.556984056084809,
    -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375, -0.381047638414538,
    -0.613568289061259, -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111, -1.50483745357134,
    -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639, -0.869434195504952,
    -0.70405854858187, -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143, -0.219526004620518,
    -0.301435496557957
  ),
  p.value.wilcox = c(0.8, 0.3, 0.7, 0.8, 0.9, 0.8, 0.7, -0.5, -0.7, -0.9),
  p.value.fisher = c(0.1, 0.7, 0.3, 0.1, 0.5, 0.3, 0.9, -0.2, -0.9, -0.4)
)
我在这里放置了虚拟的 p 值,用来展示所需输出的大致样子。真实数据有 200 多列,而且两组(Nor 和 Tor)的样本数不相等。
我在下面提到的 Stack Overflow 帖子中找到了一些示例,并尝试照着做,但都失败了。
请帮帮我。
fisher.test 需要长度相同的数据,所以我假设您的 Nor 列和 Tor 列数量相同。
使用 dplyr 的 rowwise,您可以这样写:
library(dplyr)

# Row-wise p-values comparing the Nor* columns against the Tor* columns.
#
# No column is dropped from df1, so every sample column stays in the result.
# wilcox.test() accepts groups of different sizes, so it sees every Nor and
# every Tor column.
nor_names <- grep("^Nor", names(df1), value = TRUE)
tor_names <- grep("^Tor", names(df1), value = TRUE)

# fisher.test(x, y) needs x and y of equal length, so only the first n_pair
# columns of each group are passed to it.
# NOTE(review): fisher.test() converts x and y to factors. With continuous,
# tie-free values every observation becomes its own level, and the p-value is
# then always 1. Consider binning the values before relying on this column.
n_pair <- min(length(nor_names), length(tor_names))
nor_paired <- head(nor_names, n_pair)
tor_paired <- head(tor_names, n_pair)

df1 %>%
  rowwise() %>%
  mutate(
    p.value.wilcox = wilcox.test(
      c_across(all_of(nor_names)),
      c_across(all_of(tor_names))
    )$p.value,
    p.value.fisher = fisher.test(
      c_across(all_of(nor_paired)),
      c_across(all_of(tor_paired))
    )$p.value
  ) %>%
  ungroup() # drop the rowwise grouping so later verbs act on whole columns
或者在 base R 中使用 apply:
# Row-wise p-values with base R.
#
# Columns are selected by NAME. Positions taken from names(df1) would be off
# by one inside apply(), because the row vectors come from a frame that no
# longer contains En_ID.
nor_cols <- grep("^Nor", names(df1), value = TRUE)
tor_cols <- grep("^Tor", names(df1), value = TRUE)

# fisher.test(x, y) needs equal-length inputs, so it only receives the first
# n_pair columns of each group. wilcox.test() has no such restriction and
# uses every column.
# NOTE(review): on continuous, tie-free values fisher.test() treats every
# observation as its own factor level, so its p-value is always 1.
n_pair <- min(length(nor_cols), length(tor_cols))

p_values <- t(apply(
  as.matrix(df1[c(nor_cols, tor_cols)]), # numeric matrix, named columns
  1,
  function(x) {
    c(
      p.value.wilcox = wilcox.test(x[nor_cols], x[tor_cols])$p.value,
      p.value.fisher = fisher.test(
        x[head(nor_cols, n_pair)],
        x[head(tor_cols, n_pair)]
      )$p.value
    )
  }
))
cbind(df1, p_values)
Fisher 精确检验用于分类变量,我不确定您是否可以将其应用于连续数据。对于 Mann Whitney,您可以使用这个:
# Mann-Whitney (Wilcoxon rank-sum) p-value for every row of df1, comparing
# the Nor* columns with the Tor* columns. The result is stored in a new
# column `p_val_MW`.
#
# seq_len() keeps this safe for a zero-row df1, and the columns are found by
# name rather than by hard-coded positions.
mw_nor_cols <- grep("^Nor", names(df1), value = TRUE)
mw_tor_cols <- grep("^Tor", names(df1), value = TRUE)

df1$p_val_MW <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    wilcox.test(
      as.numeric(df1[i, mw_nor_cols]),
      as.numeric(df1[i, mw_tor_cols]),
      exact = FALSE # normal approximation, as in the original loop
    )$p.value
  },
  numeric(1)
)
我们可以使用 collapse 包中的 dapply,这应该更快:
library(collapse)

# p-values for one row of the eight retained sample columns:
# positions 1-4 are Nor1-Nor4, positions 5-8 are Tor1-Tor4.
row_p_values <- function(x) {
  nor <- x[1:4]
  tor <- x[5:8]
  c(
    wilcox = wilcox.test(nor, tor)$p.value,
    fisher = fisher.test(nor, tor)$p.value
  )
}

# Drop column 1 (En_ID) and column 10 (Tor5) so both groups have four values.
sample_values <- fselect(df1, -c(1, 10))
cbind(df1, dapply(sample_values, FUN = row_p_values, MARGIN = 1))
En_ID Nor1 Nor2 Nor3 Nor4 Tor1 Tor2 Tor3 Tor4 Tor5 wilcox fisher
1 KNT00000000003 -0.8341652 -1.1853103549 -0.26265926 -0.8407529 -0.9687848 -0.4817542 -0.88934343 -0.8581416 -0.4863887 0.6857143 1
2 KNT00000000005 1.0219944 0.4237197273 1.39624811 0.5365488 -0.5028097 -0.4407840 1.07937150 1.8772872 0.7893909 0.8857143 1
3 KNT00000000419 -0.5586589 -1.2326171937 -0.54867356 -0.2774095 -0.2315264 -0.5329753 -0.21514487 -0.3810476 -0.8694342 0.1142857 1
4 KNT00000000457 -0.3901142 0.0855281134 -0.01496511 -0.2410736 -0.5300384 -0.1820898 -0.92234351 -0.6135683 -0.7040585 0.1142857 1
5 KNT00000000460 -1.2355184 -1.5236683023 -1.45458689 -0.8753132 -0.7060060 -0.5648075 -0.83210825 -1.9283834 -1.1648818 0.3428571 1
6 KNT00000000938 3.1142943 3.3669258656 2.54126941 1.6178965 3.5826436 1.7411990 2.02456083 2.2339371 2.9149718 0.8857143 1
7 KNT00000000971 0.2839322 1.0032369095 1.17711309 0.4122871 -0.1275210 -0.9616981 -0.04343229 0.6353895 -2.1033190 0.1142857 1
8 KNT00000001036 -1.1690852 -0.0002112488 -1.19425285 -1.1184666 0.2705234 -0.7217828 -0.52312656 -0.4660536 -0.5711305 0.2000000 1
9 KNT00000001084 -0.5970548 -4.7473848355 1.17788732 -2.6274529 1.6833564 -0.4334598 -0.55698406 -1.5048375 -0.2195260 0.3428571 1
10 KNT00000001167 -0.5936245 -0.3181762311 -0.36789705 -0.7604527 -0.3149021 -0.7274958 -0.74033174 -1.3340086 -0.3014355 0.6857143 1
我有 data.frame df1:
# Example input: one row per En_ID, with four "Nor" sample columns
# (Nor1-Nor4) and five "Tor" sample columns (Tor1-Tor5).
df1 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005", "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938", "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549, -0.558658947885705,
    -0.390114219973209, -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064, -0.597054772455507,
    -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646, -1.23261719368372,
    0.0855281133529292, -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964, -4.74738483548391,
    -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442, -0.548673555705647,
    -0.0149651083306594, -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181, 1.17788731755683,
    -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064, -0.277409459604357,
    -0.241073614962264, -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232, -2.6274528854429,
    -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192, -0.231526399163731,
    -0.530038395734114, -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103, 1.68335644352003,
    -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259, -0.532975340622715,
    -0.182089795101371, -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306, -0.433459827401695,
    -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343, -0.215144871523998,
    -0.92234350748557, -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426, -0.556984056084809,
    -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375, -0.381047638414538,
    -0.613568289061259, -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111, -1.50483745357134,
    -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639, -0.869434195504952,
    -0.70405854858187, -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143, -0.219526004620518,
    -0.301435496557957
  )
)
如何按列分组执行 Wilcoxon 检验(wilcox.test)和 Fisher 精确检验,即对每一行将 Nor1、Nor2、Nor3 和 Nor4 列与 Tor1、Tor2、Tor3、Tor4 和 Tor5 列进行比较?然后,我想把两个检验输出的 p 值添加为最后两列,得到 df2:
# Sketch of the desired output: the ten input columns plus one p-value column
# per test.
#
# Each sample column appears exactly once. The original literal passed `Tor4`
# a second time, which data.frame() silently turns into an extra `Tor4.1`
# column. The p-value columns use syntactic names directly; data.frame()
# would rewrite `p-value-wilcox` / `p-value-fisher` to these names anyway.
#
# The p-values are placeholders that only illustrate the layout. Some are
# negative, which a real p-value can never be.
df2 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005", "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938", "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549, -0.558658947885705,
    -0.390114219973209, -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064, -0.597054772455507,
    -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646, -1.23261719368372,
    0.0855281133529292, -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964, -4.74738483548391,
    -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442, -0.548673555705647,
    -0.0149651083306594, -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181, 1.17788731755683,
    -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064, -0.277409459604357,
    -0.241073614962264, -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232, -2.6274528854429,
    -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192, -0.231526399163731,
    -0.530038395734114, -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103, 1.68335644352003,
    -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259, -0.532975340622715,
    -0.182089795101371, -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306, -0.433459827401695,
    -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343, -0.215144871523998,
    -0.92234350748557, -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426, -0.556984056084809,
    -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375, -0.381047638414538,
    -0.613568289061259, -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111, -1.50483745357134,
    -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639, -0.869434195504952,
    -0.70405854858187, -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143, -0.219526004620518,
    -0.301435496557957
  ),
  p.value.wilcox = c(0.8, 0.3, 0.7, 0.8, 0.9, 0.8, 0.7, -0.5, -0.7, -0.9),
  p.value.fisher = c(0.1, 0.7, 0.3, 0.1, 0.5, 0.3, 0.9, -0.2, -0.9, -0.4)
)
我在这里放置了虚拟的 p 值,用来展示所需输出的大致样子。真实数据有 200 多列,而且两组(Nor 和 Tor)的样本数不相等。
我在下面提到的 Stack Overflow 帖子中找到了一些示例,并尝试照着做,但都失败了。
请帮帮我。
fisher.test 需要长度相同的数据,所以我假设您的 Nor 列和 Tor 列数量相同。
使用 dplyr 的 rowwise,您可以这样写:
library(dplyr)

# Row-wise p-values comparing the Nor* columns against the Tor* columns.
#
# No column is dropped from df1, so every sample column stays in the result.
# wilcox.test() accepts groups of different sizes, so it sees every Nor and
# every Tor column.
nor_names <- grep("^Nor", names(df1), value = TRUE)
tor_names <- grep("^Tor", names(df1), value = TRUE)

# fisher.test(x, y) needs x and y of equal length, so only the first n_pair
# columns of each group are passed to it.
# NOTE(review): fisher.test() converts x and y to factors. With continuous,
# tie-free values every observation becomes its own level, and the p-value is
# then always 1. Consider binning the values before relying on this column.
n_pair <- min(length(nor_names), length(tor_names))
nor_paired <- head(nor_names, n_pair)
tor_paired <- head(tor_names, n_pair)

df1 %>%
  rowwise() %>%
  mutate(
    p.value.wilcox = wilcox.test(
      c_across(all_of(nor_names)),
      c_across(all_of(tor_names))
    )$p.value,
    p.value.fisher = fisher.test(
      c_across(all_of(nor_paired)),
      c_across(all_of(tor_paired))
    )$p.value
  ) %>%
  ungroup() # drop the rowwise grouping so later verbs act on whole columns
或者在 base R 中使用 apply:
# Row-wise p-values with base R.
#
# Columns are selected by NAME. Positions taken from names(df1) would be off
# by one inside apply(), because the row vectors come from a frame that no
# longer contains En_ID.
nor_cols <- grep("^Nor", names(df1), value = TRUE)
tor_cols <- grep("^Tor", names(df1), value = TRUE)

# fisher.test(x, y) needs equal-length inputs, so it only receives the first
# n_pair columns of each group. wilcox.test() has no such restriction and
# uses every column.
# NOTE(review): on continuous, tie-free values fisher.test() treats every
# observation as its own factor level, so its p-value is always 1.
n_pair <- min(length(nor_cols), length(tor_cols))

p_values <- t(apply(
  as.matrix(df1[c(nor_cols, tor_cols)]), # numeric matrix, named columns
  1,
  function(x) {
    c(
      p.value.wilcox = wilcox.test(x[nor_cols], x[tor_cols])$p.value,
      p.value.fisher = fisher.test(
        x[head(nor_cols, n_pair)],
        x[head(tor_cols, n_pair)]
      )$p.value
    )
  }
))
cbind(df1, p_values)
Fisher 精确检验用于分类变量,我不确定您是否可以将其应用于连续数据。对于 Mann Whitney,您可以使用这个:
# Mann-Whitney (Wilcoxon rank-sum) p-value for every row of df1, comparing
# the Nor* columns with the Tor* columns. The result is stored in a new
# column `p_val_MW`.
#
# seq_len() keeps this safe for a zero-row df1, and the columns are found by
# name rather than by hard-coded positions.
mw_nor_cols <- grep("^Nor", names(df1), value = TRUE)
mw_tor_cols <- grep("^Tor", names(df1), value = TRUE)

df1$p_val_MW <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    wilcox.test(
      as.numeric(df1[i, mw_nor_cols]),
      as.numeric(df1[i, mw_tor_cols]),
      exact = FALSE # normal approximation, as in the original loop
    )$p.value
  },
  numeric(1)
)
我们可以使用 collapse 包中的 dapply,这应该更快:
library(collapse)

# p-values for one row of the eight retained sample columns:
# positions 1-4 are Nor1-Nor4, positions 5-8 are Tor1-Tor4.
row_p_values <- function(x) {
  nor <- x[1:4]
  tor <- x[5:8]
  c(
    wilcox = wilcox.test(nor, tor)$p.value,
    fisher = fisher.test(nor, tor)$p.value
  )
}

# Drop column 1 (En_ID) and column 10 (Tor5) so both groups have four values.
sample_values <- fselect(df1, -c(1, 10))
cbind(df1, dapply(sample_values, FUN = row_p_values, MARGIN = 1))
En_ID Nor1 Nor2 Nor3 Nor4 Tor1 Tor2 Tor3 Tor4 Tor5 wilcox fisher
1 KNT00000000003 -0.8341652 -1.1853103549 -0.26265926 -0.8407529 -0.9687848 -0.4817542 -0.88934343 -0.8581416 -0.4863887 0.6857143 1
2 KNT00000000005 1.0219944 0.4237197273 1.39624811 0.5365488 -0.5028097 -0.4407840 1.07937150 1.8772872 0.7893909 0.8857143 1
3 KNT00000000419 -0.5586589 -1.2326171937 -0.54867356 -0.2774095 -0.2315264 -0.5329753 -0.21514487 -0.3810476 -0.8694342 0.1142857 1
4 KNT00000000457 -0.3901142 0.0855281134 -0.01496511 -0.2410736 -0.5300384 -0.1820898 -0.92234351 -0.6135683 -0.7040585 0.1142857 1
5 KNT00000000460 -1.2355184 -1.5236683023 -1.45458689 -0.8753132 -0.7060060 -0.5648075 -0.83210825 -1.9283834 -1.1648818 0.3428571 1
6 KNT00000000938 3.1142943 3.3669258656 2.54126941 1.6178965 3.5826436 1.7411990 2.02456083 2.2339371 2.9149718 0.8857143 1
7 KNT00000000971 0.2839322 1.0032369095 1.17711309 0.4122871 -0.1275210 -0.9616981 -0.04343229 0.6353895 -2.1033190 0.1142857 1
8 KNT00000001036 -1.1690852 -0.0002112488 -1.19425285 -1.1184666 0.2705234 -0.7217828 -0.52312656 -0.4660536 -0.5711305 0.2000000 1
9 KNT00000001084 -0.5970548 -4.7473848355 1.17788732 -2.6274529 1.6833564 -0.4334598 -0.55698406 -1.5048375 -0.2195260 0.3428571 1
10 KNT00000001167 -0.5936245 -0.3181762311 -0.36789705 -0.7604527 -0.3149021 -0.7274958 -0.74033174 -1.3340086 -0.3014355 0.6857143 1